Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/Makefile | 3
-rw-r--r--  fs/bcachefs/acl.c | 67
-rw-r--r--  fs/bcachefs/alloc_background.c | 386
-rw-r--r--  fs/bcachefs/alloc_background.h | 117
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 308
-rw-r--r--  fs/bcachefs/alloc_foreground.h | 15
-rw-r--r--  fs/bcachefs/alloc_types.h | 10
-rw-r--r--  fs/bcachefs/backpointers.c | 332
-rw-r--r--  fs/bcachefs/backpointers.h | 92
-rw-r--r--  fs/bcachefs/bcachefs.h | 86
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 216
-rw-r--r--  fs/bcachefs/bkey.c | 15
-rw-r--r--  fs/bcachefs/bkey.h | 39
-rw-r--r--  fs/bcachefs/bkey_methods.c | 28
-rw-r--r--  fs/bcachefs/bkey_methods.h | 73
-rw-r--r--  fs/bcachefs/bkey_sort.c | 79
-rw-r--r--  fs/bcachefs/bkey_sort.h | 4
-rw-r--r--  fs/bcachefs/bset.c | 43
-rw-r--r--  fs/bcachefs/bset.h | 8
-rw-r--r--  fs/bcachefs/btree_cache.c | 227
-rw-r--r--  fs/bcachefs/btree_cache.h | 5
-rw-r--r--  fs/bcachefs/btree_gc.c | 1451
-rw-r--r--  fs/bcachefs/btree_gc.h | 72
-rw-r--r--  fs/bcachefs/btree_gc_types.h | 29
-rw-r--r--  fs/bcachefs/btree_io.c | 239
-rw-r--r--  fs/bcachefs/btree_io.h | 2
-rw-r--r--  fs/bcachefs/btree_iter.c | 401
-rw-r--r--  fs/bcachefs/btree_iter.h | 94
-rw-r--r--  fs/bcachefs/btree_journal_iter.c | 126
-rw-r--r--  fs/bcachefs/btree_journal_iter.h | 10
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 154
-rw-r--r--  fs/bcachefs/btree_key_cache_types.h | 8
-rw-r--r--  fs/bcachefs/btree_locking.c | 208
-rw-r--r--  fs/bcachefs/btree_locking.h | 4
-rw-r--r--  fs/bcachefs/btree_node_scan.c | 524
-rw-r--r--  fs/bcachefs/btree_node_scan.h | 11
-rw-r--r--  fs/bcachefs/btree_node_scan_types.h | 31
-rw-r--r--  fs/bcachefs/btree_trans_commit.c | 109
-rw-r--r--  fs/bcachefs/btree_types.h | 143
-rw-r--r--  fs/bcachefs/btree_update.c | 101
-rw-r--r--  fs/bcachefs/btree_update.h | 14
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 482
-rw-r--r--  fs/bcachefs/btree_update_interior.h | 36
-rw-r--r--  fs/bcachefs/btree_write_buffer.c | 36
-rw-r--r--  fs/bcachefs/buckets.c | 698
-rw-r--r--  fs/bcachefs/buckets.h | 79
-rw-r--r--  fs/bcachefs/chardev.c | 170
-rw-r--r--  fs/bcachefs/checksum.c | 78
-rw-r--r--  fs/bcachefs/checksum.h | 5
-rw-r--r--  fs/bcachefs/compress.h | 8
-rw-r--r--  fs/bcachefs/data_update.c | 83
-rw-r--r--  fs/bcachefs/debug.c | 155
-rw-r--r--  fs/bcachefs/dirent.c | 100
-rw-r--r--  fs/bcachefs/dirent.h | 8
-rw-r--r--  fs/bcachefs/disk_groups.c | 11
-rw-r--r--  fs/bcachefs/disk_groups_format.h | 21
-rw-r--r--  fs/bcachefs/ec.c | 419
-rw-r--r--  fs/bcachefs/ec.h | 9
-rw-r--r--  fs/bcachefs/errcode.h | 4
-rw-r--r--  fs/bcachefs/error.c | 65
-rw-r--r--  fs/bcachefs/error.h | 6
-rw-r--r--  fs/bcachefs/extent_update.c | 2
-rw-r--r--  fs/bcachefs/extents.c | 207
-rw-r--r--  fs/bcachefs/extents.h | 37
-rw-r--r--  fs/bcachefs/eytzinger.c | 305
-rw-r--r--  fs/bcachefs/eytzinger.h | 81
-rw-r--r--  fs/bcachefs/fs-common.c | 38
-rw-r--r--  fs/bcachefs/fs-io-buffered.c | 24
-rw-r--r--  fs/bcachefs/fs-io-direct.c | 29
-rw-r--r--  fs/bcachefs/fs-io-pagecache.c | 2
-rw-r--r--  fs/bcachefs/fs-io.c | 25
-rw-r--r--  fs/bcachefs/fs-ioctl.c | 2
-rw-r--r--  fs/bcachefs/fs.c | 136
-rw-r--r--  fs/bcachefs/fsck.c | 519
-rw-r--r--  fs/bcachefs/inode.c | 66
-rw-r--r--  fs/bcachefs/inode.h | 23
-rw-r--r--  fs/bcachefs/io_misc.c | 12
-rw-r--r--  fs/bcachefs/io_read.c | 68
-rw-r--r--  fs/bcachefs/io_write.c | 125
-rw-r--r--  fs/bcachefs/io_write_types.h | 1
-rw-r--r--  fs/bcachefs/journal.c | 139
-rw-r--r--  fs/bcachefs/journal.h | 6
-rw-r--r--  fs/bcachefs/journal_io.c | 240
-rw-r--r--  fs/bcachefs/journal_io.h | 5
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 10
-rw-r--r--  fs/bcachefs/journal_sb.c | 10
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c | 80
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h | 2
-rw-r--r--  fs/bcachefs/journal_seq_blacklist_format.h | 15
-rw-r--r--  fs/bcachefs/journal_types.h | 16
-rw-r--r--  fs/bcachefs/logged_ops.c | 9
-rw-r--r--  fs/bcachefs/lru.c | 4
-rw-r--r--  fs/bcachefs/lru.h | 2
-rw-r--r--  fs/bcachefs/mean_and_variance_test.c | 29
-rw-r--r--  fs/bcachefs/migrate.c | 8
-rw-r--r--  fs/bcachefs/move.c | 102
-rw-r--r--  fs/bcachefs/movinggc.c | 4
-rw-r--r--  fs/bcachefs/opts.c | 33
-rw-r--r--  fs/bcachefs/opts.h | 28
-rw-r--r--  fs/bcachefs/printbuf.c | 239
-rw-r--r--  fs/bcachefs/printbuf.h | 53
-rw-r--r--  fs/bcachefs/quota.c | 131
-rw-r--r--  fs/bcachefs/quota.h | 4
-rw-r--r--  fs/bcachefs/rebalance.c | 10
-rw-r--r--  fs/bcachefs/recovery.c | 559
-rw-r--r--  fs/bcachefs/recovery.h | 32
-rw-r--r--  fs/bcachefs/recovery_passes.c | 245
-rw-r--r--  fs/bcachefs/recovery_passes.h | 17
-rw-r--r--  fs/bcachefs/recovery_passes_types.h (renamed from fs/bcachefs/recovery_types.h) | 11
-rw-r--r--  fs/bcachefs/reflink.c | 75
-rw-r--r--  fs/bcachefs/reflink.h | 16
-rw-r--r--  fs/bcachefs/replicas.c | 81
-rw-r--r--  fs/bcachefs/replicas_format.h | 31
-rw-r--r--  fs/bcachefs/sb-clean.c | 35
-rw-r--r--  fs/bcachefs/sb-counters.c | 20
-rw-r--r--  fs/bcachefs/sb-downgrade.c | 39
-rw-r--r--  fs/bcachefs/sb-downgrade_format.h | 17
-rw-r--r--  fs/bcachefs/sb-errors.c | 2
-rw-r--r--  fs/bcachefs/sb-errors_format.h | 296
-rw-r--r--  fs/bcachefs/sb-errors_types.h | 271
-rw-r--r--  fs/bcachefs/sb-members.c | 196
-rw-r--r--  fs/bcachefs/sb-members.h | 186
-rw-r--r--  fs/bcachefs/sb-members_format.h | 110
-rw-r--r--  fs/bcachefs/sb-members_types.h | 21
-rw-r--r--  fs/bcachefs/snapshot.c | 360
-rw-r--r--  fs/bcachefs/snapshot.h | 111
-rw-r--r--  fs/bcachefs/str_hash.h | 70
-rw-r--r--  fs/bcachefs/subvolume.c | 103
-rw-r--r--  fs/bcachefs/subvolume.h | 10
-rw-r--r--  fs/bcachefs/subvolume_types.h | 2
-rw-r--r--  fs/bcachefs/super-io.c | 194
-rw-r--r--  fs/bcachefs/super-io.h | 3
-rw-r--r--  fs/bcachefs/super.c | 151
-rw-r--r--  fs/bcachefs/super_types.h | 13
-rw-r--r--  fs/bcachefs/sysfs.c | 195
-rw-r--r--  fs/bcachefs/tests.c | 18
-rw-r--r--  fs/bcachefs/thread_with_file.c | 15
-rw-r--r--  fs/bcachefs/thread_with_file.h | 3
-rw-r--r--  fs/bcachefs/trace.h | 103
-rw-r--r--  fs/bcachefs/util.c | 204
-rw-r--r--  fs/bcachefs/util.h | 27
-rw-r--r--  fs/bcachefs/xattr.c | 47
-rw-r--r--  fs/bcachefs/xattr.h | 2
143 files changed, 8660 insertions, 6469 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index b02796c8a595..66ca0bbee639 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \
btree_key_cache.o \
btree_locking.o \
+ btree_node_scan.o \
btree_trans_commit.o \
btree_update.o \
btree_update_interior.o \
@@ -37,6 +38,7 @@ bcachefs-y := \
error.o \
extents.o \
extent_update.o \
+ eytzinger.o \
fs.o \
fs-common.o \
fs-ioctl.o \
@@ -67,6 +69,7 @@ bcachefs-y := \
quota.o \
rebalance.o \
recovery.o \
+ recovery_passes.o \
reflink.o \
replicas.o \
sb-clean.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..250d6c6d3a3a 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,37 +281,29 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
retry:
bch2_trans_begin(trans);
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
- if (!IS_ERR(acl))
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
@@ -368,7 +360,7 @@ retry:
ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -416,39 +408,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_i_xattr *new;
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
return bch2_err_matches(ret, ENOENT) ? 0 : ret;
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
- if (IS_ERR_OR_NULL(acl))
+ if (ret)
goto err;
- ret = allocate_dropping_locks_errcode(trans,
- __posix_acl_chmod(&acl, _gfp, mode));
+ ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
if (ret)
goto err;
- new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- if (IS_ERR(new)) {
- ret = PTR_ERR(new);
+ struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
goto err;
- }
new->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 893e38f9db80..346cd91f91f9 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -195,7 +195,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
}
int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@@ -211,7 +211,7 @@ fsck_err:
}
int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
@@ -225,7 +225,7 @@ fsck_err:
}
int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
@@ -239,15 +239,15 @@ fsck_err:
}
int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
@@ -263,7 +263,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
+ bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
@@ -330,27 +330,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
- prt_printf(out, "journal_seq %llu", a->journal_seq);
- prt_newline(out);
- prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_newline(out);
- prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_newline(out);
- prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
- prt_newline(out);
- prt_printf(out, "cached_sectors %u", a->cached_sectors);
- prt_newline(out);
- prt_printf(out, "stripe %u", a->stripe);
- prt_newline(out);
- prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
- prt_newline(out);
- prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
- prt_newline(out);
- prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
- prt_newline(out);
- prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
- prt_newline(out);
- prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_printf(out, "journal_seq %llu\n", a->journal_seq);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+ prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
+ prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
printbuf_indent_sub(out, 2);
}
@@ -439,22 +429,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- int ret;
-
- k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ERR_PTR(ret);
- a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
ret = PTR_ERR_OR_ZERO(a);
if (unlikely(ret))
goto err;
@@ -464,6 +450,20 @@ err:
return ERR_PTR(ret);
}
+__flatten
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return unlikely(ret) ? ERR_PTR(ret) : a;
+}
+
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
@@ -487,7 +487,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
}
int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -520,7 +520,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
int ret;
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -567,29 +567,31 @@ iter_err:
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_dev *ca = NULL;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_exists2(c, k.k->p.inode))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
continue;
+ }
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
@@ -599,15 +601,16 @@ int bch2_alloc_read(struct bch_fs *c)
}));
} else {
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_bucket_exists(c, k.k->p))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
continue;
-
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ }
struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
@@ -615,6 +618,7 @@ int bch2_alloc_read(struct bch_fs *c)
}));
}
+ bch2_dev_put(ca);
bch2_trans_put(trans);
up_read(&c->gc_lock);
@@ -625,12 +629,12 @@ int bch2_alloc_read(struct bch_fs *c)
/* Free space/discard btree: */
static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c alloc_k,
const struct bch_alloc_v4 *a,
bool set)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
struct btree_iter iter;
struct bkey_s_c old;
struct bkey_i *k;
@@ -667,7 +671,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans,
old = bch2_bkey_get_iter(trans, &iter, btree,
bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bkey_err(old);
if (ret)
return ret;
@@ -711,8 +715,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
ret = bkey_err(k);
if (ret)
return ret;
@@ -734,26 +738,24 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
int ret = 0;
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
+ if (!ca)
return -EIO;
- struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ alloc_data_type_set(new_a, new_a->data_type);
- if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
+ if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
@@ -770,10 +772,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (old_a->data_type != new_a->data_type ||
(new_a->data_type == BCH_DATA_free &&
alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
if (ret)
- return ret;
+ goto err;
}
if (new_a->data_type == BCH_DATA_cached &&
@@ -787,24 +789,23 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_to_u64(new.k->p),
old_lru, new_lru);
if (ret)
- return ret;
+ goto err;
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new.k->p.inode));
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
ret = bch2_lru_change(trans,
BCH_LRU_FRAGMENTATION_START,
bucket_to_u64(new.k->p),
old_a->fragmentation_lru, new_a->fragmentation_lru);
if (ret)
- return ret;
+ goto err;
}
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
if (ret)
- return ret;
+ goto err;
}
/*
@@ -812,21 +813,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
* not:
*/
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
old_a->cached_sectors) {
ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
-((s64) old_a->cached_sectors));
if (ret)
- return ret;
+ goto err;
}
}
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
- if ((flags & BTREE_TRIGGER_INSERT) &&
+ if ((flags & BTREE_TRIGGER_insert) &&
data_type_is_empty(old_a->data_type) !=
data_type_is_empty(new_a->data_type) &&
new.k->type == KEY_TYPE_alloc_v4) {
@@ -854,7 +855,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (ret) {
bch2_fs_fatal_error(c,
"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
- return ret;
+ goto err;
}
}
@@ -884,11 +885,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bch2_do_invalidates(c);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
- bch2_do_gc_gens(c);
+ bch2_gc_gens_async(c);
}
- if ((flags & BTREE_TRIGGER_GC) &&
- (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
+ if ((flags & BTREE_TRIGGER_gc) &&
+ (flags & BTREE_TRIGGER_bucket_invalidate)) {
struct bch_alloc_v4 new_a_convert;
const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
@@ -908,12 +909,13 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_unlock(g);
percpu_up_read(&c->mark_lock);
}
-
- return 0;
+err:
+ bch2_dev_put(ca);
+ return ret;
}
/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
@@ -958,35 +960,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
}
}
-static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
- struct bch_dev *ca;
-
- if (bch2_dev_bucket_exists(c, *bucket))
- return true;
-
- if (bch2_dev_exists2(c, bucket->inode)) {
- ca = bch_dev_bkey_exists(c, bucket->inode);
+ if (*ca) {
+ if (bucket->offset < (*ca)->mi.first_bucket)
+ bucket->offset = (*ca)->mi.first_bucket;
- if (bucket->offset < ca->mi.first_bucket) {
- bucket->offset = ca->mi.first_bucket;
+ if (bucket->offset < (*ca)->mi.nbuckets)
return true;
- }
+ bch2_dev_put(*ca);
+ *ca = NULL;
bucket->inode++;
bucket->offset = 0;
}
rcu_read_lock();
- ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (ca)
- *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
+ if (*ca) {
+ *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
+ bch2_dev_get(*ca);
+ }
rcu_read_unlock();
- return ca != NULL;
+ return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
struct bch_fs *c = iter->trans->c;
struct bkey_s_c k;
@@ -995,22 +996,21 @@ again:
if (bkey_err(k))
return k;
+ *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
+
if (!k.k->type) {
- struct bpos bucket = bkey_start_pos(k.k);
+ struct bpos hole_start = bkey_start_pos(k.k);
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (!next_bucket(c, &bucket))
+ if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
+ if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bucket);
+ bch2_btree_iter_set_pos(iter, hole_start);
goto again;
}
- if (!bch2_dev_bucket_exists(c, k.k->p)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
-
- bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
- }
+ if (k.k->p.offset > (*ca)->mi.nbuckets)
+ bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
}
return k;
@@ -1025,24 +1025,25 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct btree_iter *bucket_gens_iter)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
unsigned discard_key_type, freespace_key_type;
unsigned gens_offset;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
- int ret;
+ int ret = 0;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
- alloc_key_to_missing_dev_bucket,
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
+ if (fsck_err_on(!ca,
+ c, alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
- return bch2_btree_delete_at(trans, alloc_iter, 0);
+ ret = bch2_btree_delete_at(trans, alloc_iter, 0);
+ if (!ca)
+ return ret;
- ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
if (!ca->mi.freespace_initialized)
- return 0;
+ goto out;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
@@ -1141,25 +1142,26 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
}
+out:
err:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos start,
struct bpos *end,
struct btree_iter *freespace_iter)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
- ca = bch_dev_bkey_exists(c, start.inode);
if (!ca->mi.freespace_initialized)
return 0;
@@ -1313,7 +1315,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
goto delete;
out:
fsck_err:
- set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(&alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
@@ -1337,30 +1339,25 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_bucket_gens g;
- struct bch_dev *ca;
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
u64 b;
- bool need_update = false, dev_exists;
+ bool need_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
bkey_reassemble(&g.k_i, k);
- /* if no bch_dev, skip out whether we repair or not */
- dev_exists = bch2_dev_exists2(c, k.k->p.inode);
- if (!dev_exists) {
- if (fsck_err_on(!dev_exists, c,
- bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
+ if (!ca) {
+ if (fsck_err(c, bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
- }
goto out;
}
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
bucket_gens_to_invalid_buckets,
@@ -1398,6 +1395,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
out:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
@@ -1406,25 +1404,26 @@ int bch2_check_alloc_info(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bch_dev *ca = NULL;
struct bkey hole;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
struct bpos next;
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1445,7 +1444,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
} else {
next = k.k->p;
- ret = bch2_check_alloc_hole_freespace(trans,
+ ret = bch2_check_alloc_hole_freespace(trans, ca,
bkey_start_pos(k.k),
&next,
&freespace_iter) ?:
@@ -1473,19 +1472,21 @@ bkey_err:
bch2_trans_iter_exit(trans, &freespace_iter);
bch2_trans_iter_exit(trans, &discard_iter);
bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
+ ca = NULL;
if (ret < 0)
goto err;
ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
bch2_check_discard_freespace_key(trans, &iter));
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -1515,7 +1516,7 @@ bkey_err:
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
@@ -1562,7 +1563,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ &a_mut->k_i, BTREE_TRIGGER_norun);
if (ret)
goto err;
@@ -1601,7 +1602,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_PREFETCH, k,
+ POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter)));
bch_err_fn(c, ret);
@@ -1657,9 +1658,7 @@ static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_st
bch2_journal_flush_async(&c->journal, NULL);
if (s->ca)
- percpu_ref_put(&s->ca->ref);
- if (ca)
- percpu_ref_get(&ca->ref);
+ percpu_ref_put(&s->ca->io_ref);
s->ca = ca;
s->need_journal_commit_this_dev = 0;
}
@@ -1673,15 +1672,15 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bpos pos = need_discard_iter->pos;
struct btree_iter iter = { NULL };
struct bkey_s_c k;
- struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
bool discard_locked = false;
int ret = 0;
- ca = bch_dev_bkey_exists(c, pos.inode);
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
+ ? s->ca
+ : bch2_dev_get_ioref(c, pos.inode, WRITE);
+ if (!ca) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
@@ -1703,7 +1702,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
need_discard_iter->pos,
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto out;
@@ -1713,34 +1712,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (bch2_bucket_sectors_total(a->v)) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "attempting to discard bucket with dirty data\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
- }
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (data_type_is_empty(a->v.data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
}
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
goto out;
}
@@ -1768,7 +1770,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+ alloc_data_type_set(&a->v, a->v.data_type);
write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
@@ -1784,7 +1786,6 @@ out:
discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
- percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
return ret;
}
@@ -1824,7 +1825,7 @@ void bch2_do_discards(struct bch_fs *c)
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
int ret = bkey_err(k);
if (ret)
@@ -1835,8 +1836,9 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
if (ret)
goto err;
+ BUG_ON(a->v.dirty_sectors);
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+ alloc_data_type_set(&a->v, a->v.data_type);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
@@ -1858,9 +1860,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
if (i->snapshot)
continue;
- ca = bch_dev_bkey_exists(c, i->inode);
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ ca = bch2_dev_get_ioref(c, i->inode, WRITE);
+ if (!ca) {
darray_remove_item(&c->discard_buckets_in_flight, i);
continue;
}
@@ -1899,9 +1900,12 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
+ bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
+ rcu_read_unlock();
- if (!percpu_ref_is_dying(&ca->io_ref) &&
+ if (!dead &&
!discard_in_flight_add(c, bucket) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
!queue_work(c->write_ref_wq, &c->discard_fast_work))
@@ -1914,7 +1918,6 @@ static int invalidate_one_bucket(struct btree_trans *trans,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
@@ -1932,7 +1935,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ a = bch2_trans_start_alloc_update(trans, bucket);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1942,6 +1945,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
@@ -1956,18 +1960,15 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
- ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
- bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
err:
@@ -2009,11 +2010,11 @@ static void bch2_do_invalidates_work(struct work_struct *work)
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
if (ret < 0) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
break;
}
}
@@ -2046,7 +2047,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
/*
* Scan the alloc btree for every bucket on @ca, and add buckets to the
* freespace/need_discard/need_gc_gens btrees as needed:
@@ -2078,7 +2079,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (ret)
@@ -2150,7 +2151,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
@@ -2177,7 +2178,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
u64 now;
int ret = 0;
- a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 052b2fac25d6..ae31a94be6f9 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,21 +8,18 @@
#include "debug.h"
#include "super.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- struct bch_dev *ca;
-
- if (!bch2_dev_exists2(c, pos.inode))
- return false;
-
- ca = bch_dev_bkey_exists(c, pos.inode);
- return pos.offset >= ca->mi.first_bucket &&
- pos.offset < ca->mi.nbuckets;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, pos.inode);
+ bool ret = ca && bucket_valid(ca, pos.offset);
+ rcu_read_unlock();
+ return ret;
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -40,38 +37,50 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
return a.gen - a.oldest_gen;
}
-static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
- u32 cached_sectors,
- u32 stripe,
- struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
{
- if (stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (dirty_sectors)
- return data_type;
- if (cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
}
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
+{
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
+}
+
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
- return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
- a.stripe, a, data_type);
+ struct bch_alloc_v4 ret = {};
+ __bucket_m_to_alloc(&ret, b);
+ return ret;
}
static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
{
- return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+ switch (data_type) {
+ case BCH_DATA_cached:
+ case BCH_DATA_stripe:
+ return BCH_DATA_user;
+ default:
+ return data_type;
+ }
+}
+
+static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
+ enum bch_data_type ptr)
+{
+ return !data_type_is_empty(bucket) &&
+ bucket_data_type(bucket) != bucket_data_type(ptr);
}
-static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
return a.dirty_sectors + a.cached_sectors;
}
@@ -89,6 +98,27 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (a.stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (a.dirty_sectors)
+ return data_type;
+ if (a.cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
+{
+ a->data_type = alloc_data_type(*a, data_type);
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@@ -126,13 +156,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
return pos;
}
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
{
- unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
BCH_ALLOC_V4_U64s_V0) +
BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
(sizeof(struct bch_backpointer) / sizeof(u64));
+}
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = alloc_v4_u64s_noerror(a);
BUG_ON(ret > U8_MAX - BKEY_U64s);
return ret;
}
@@ -143,7 +177,9 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -169,13 +205,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -209,7 +245,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
})
int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
@@ -229,7 +265,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 214b15c84d1f..927a5f300b30 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c)
{
rcu_read_lock();
for_each_member_device_rcu(c, ca, NULL)
- ca->alloc_cursor = 0;
+ memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
rcu_read_unlock();
}
@@ -100,7 +100,7 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (ob->ec) {
ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
@@ -298,7 +300,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
k = bch2_bkey_get_iter(trans, &iter,
BTREE_ID_alloc, POS(ca->dev_idx, b),
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (ret) {
ob = ERR_PTR(ret);
@@ -340,9 +342,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
struct bch_backpointer bp;
struct bpos bp_pos = POS_MIN;
- ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1,
&bp_pos, &bp,
- BTREE_ITER_NOPRESERVE);
+ BTREE_ITER_nopreserve);
if (ret) {
ob = ERR_PTR(ret);
goto err;
@@ -361,10 +363,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
if (!ob)
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
err:
if (iter.path)
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ob;
@@ -387,7 +389,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
int ret;
@@ -402,9 +405,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
*/
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_SLOTS, k, ret) {
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
+ BTREE_ITER_slots, k, ret) {
+ u64 bucket = k.k->p.offset;
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
@@ -413,12 +415,29 @@ again:
is_superblock_bucket(ca, k.k->p.offset))
continue;
- a = bch2_alloc_to_v4(k, &a_convert);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ break;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ s->buckets_seen++;
+ s->skipped_mi_btree_bitmap++;
+ continue;
+ }
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
if (a->data_type != BCH_DATA_free)
continue;
/* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
ret = bkey_err(ck);
if (ret)
break;
@@ -431,7 +450,7 @@ again:
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
- set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -439,7 +458,6 @@ next:
bch2_trans_iter_exit(trans, &iter);
alloc_cursor = iter.pos.offset;
- ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
@@ -449,6 +467,8 @@ next:
goto again;
}
+ *dev_alloc_cursor = alloc_cursor;
+
return ob;
}
@@ -461,7 +481,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
@@ -483,10 +504,30 @@ again:
s->buckets_seen++;
+ u64 bucket = alloc_cursor & ~(~0ULL << 56);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ goto fail;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ u64 genbits = alloc_cursor >> 56;
+ alloc_cursor = bucket | (genbits << 56);
+
+ if (alloc_cursor > k.k->p.offset)
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ s->skipped_mi_btree_bitmap++;
+ continue;
+ }
+
ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
if (ob) {
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
break;
}
}
@@ -494,10 +535,9 @@ again:
if (ob || ret)
break;
}
+fail:
bch2_trans_iter_exit(trans, &iter);
- ca->alloc_cursor = alloc_cursor;
-
if (!ob && ret)
ob = ERR_PTR(ret);
@@ -506,14 +546,56 @@ again:
goto again;
}
+ *dev_alloc_cursor = alloc_cursor;
+
return ob;
}
+static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl,
+ struct bch_dev_usage *usage,
+ struct bucket_alloc_state *s,
+ struct open_bucket *ob)
+{
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_tabstop_push(&buf, 24);
+
+ prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+ prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
+ prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
+ prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", s->skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+
+ if (!IS_ERR(ob)) {
+ prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
+ trace_bucket_alloc(c, buf.buf);
+ } else {
+ prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
+ trace_bucket_alloc_fail(c, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
/**
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
* @ca: device to allocate from
* @watermark: how important is this allocation?
+ * @data_type: BCH_DATA_journal, btree, user...
* @cl: if not NULL, closure to be used to wait if buckets not available
* @usage: for secondarily also returning the current device usage
*
@@ -522,6 +604,7 @@ again:
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl,
struct bch_dev_usage *usage)
{
@@ -529,7 +612,9 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
u64 avail;
- struct bucket_alloc_state s = { 0 };
+ struct bucket_alloc_state s = {
+ .btree_bitmap = data_type == BCH_DATA_btree,
+ };
bool waiting = false;
again:
bch2_dev_usage_read_fast(ca, usage);
@@ -539,7 +624,7 @@ again:
bch2_do_discards(c);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
- bch2_do_gc_gens(c);
+ bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
bch2_do_invalidates(c);
@@ -567,6 +652,11 @@ alloc:
if (s.skipped_need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
+ if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
+ s.btree_bitmap = BTREE_BITMAP_ANY;
+ goto alloc;
+ }
+
if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
freespace = false;
goto alloc;
@@ -576,33 +666,24 @@ err:
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
if (!IS_ERR(ob))
- trace_and_count(c, bucket_alloc, ca,
- bch2_watermarks[watermark],
- ob->bucket,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- "");
+ ob->data_type = data_type;
+
+ if (!IS_ERR(ob))
+ count_event(c, bucket_alloc);
else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- trace_and_count(c, bucket_alloc_fail, ca,
- bch2_watermarks[watermark],
- 0,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- bch2_err_str(PTR_ERR(ob)));
+ count_event(c, bucket_alloc_fail);
+
+ if (!IS_ERR(ob)
+ ? trace_bucket_alloc_enabled()
+ : trace_bucket_alloc_fail_enabled())
+ trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
return ob;
}
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl)
{
struct bch_dev_usage usage;
@@ -610,7 +691,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- cl, &usage)));
+ data_type, cl, &usage)));
return ob;
}
@@ -676,8 +757,7 @@ static int add_new_bucket(struct bch_fs *c,
unsigned flags,
struct open_bucket *ob)
{
- unsigned durability =
- bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned durability = ob_dev(c, ob)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
@@ -709,37 +789,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- unsigned dev;
- struct bch_dev *ca;
int ret = -BCH_ERR_insufficient_devices;
- unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
- for (i = 0; i < devs_sorted.nr; i++) {
+ for (unsigned i = 0; i < devs_sorted.nr; i++) {
struct bch_dev_usage usage;
struct open_bucket *ob;
- dev = devs_sorted.devs[i];
-
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ unsigned dev = devs_sorted.devs[i];
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
continue;
if (!ca->mi.durability && *have_cache) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
@@ -748,8 +819,6 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob->data_type = data_type;
-
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob)) {
@@ -834,7 +903,7 @@ static bool want_bucket(struct bch_fs *c,
bool *have_cache, bool ec,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (!test_bit(ob->dev, devs_may_alloc->d))
return false;
@@ -904,7 +973,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_dev_usage usage;
u64 avail;
@@ -1289,7 +1358,7 @@ deallocate_extra_replicas(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
- unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned d = ob_dev(c, ob)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
@@ -1340,6 +1409,10 @@ retry:
*wp_ret = wp = writepoint_find(trans, write_point.v);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
/* metadata may not allocate on cache devices: */
if (wp->data_type != BCH_DATA_user)
have_cache = true;
@@ -1442,7 +1515,7 @@ err:
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
return (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
@@ -1518,7 +1591,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
@@ -1620,3 +1693,104 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
prt_str(out, "Btree write point\n");
bch2_write_point_to_text(out, c, &c->btree_write_point);
}
+
+void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 24);
+
+ percpu_down_read(&c->mark_lock);
+ prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
+ prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
+ prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data));
+ prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
+ prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
+ prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
+ prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
+ percpu_up_read(&c->mark_lock);
+
+ prt_newline(out);
+ prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
+ prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
+ prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
+ prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
+}
+
+void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+
+ bch2_dev_usage_to_text(out, &stats);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:\n");
+ for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
+ prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
+ prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+}
+
+void bch2_print_allocator_stuck(struct bch_fs *c)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n");
+
+ prt_printf(&buf, "Allocator debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_fs_alloc_debug_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ for_each_online_member(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ prt_printf(&buf, "Copygc debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_copygc_wait_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Journal debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_journal_debug_to_text(&buf, &c->journal);
+ printbuf_indent_sub(&buf, 2);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
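
(Editorial sketch, not part of the patch.) With the new parameter, callers pass the data type at allocation time instead of setting ob->data_type after the fact; a hypothetical journal-bucket caller, error handling abbreviated, might look like:

	static int example_alloc_journal_bucket(struct bch_fs *c, struct bch_dev *ca)
	{
		/* closure is NULL here: fail instead of waiting for free buckets */
		struct open_bucket *ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_btree,
							   BCH_DATA_journal, NULL);
		if (IS_ERR(ob))
			return PTR_ERR(ob);

		/* ob->data_type is now filled in by the allocator; use ob->bucket, then release */
		bch2_open_bucket_put(c, ob);
		return 0;
	}
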
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 7aaeec44c746..a42c9730d32a 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -30,8 +30,14 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
+static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
+{
+ return bch2_dev_have_ref(c, ob->dev);
+}
+
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, struct closure *);
+ enum bch_watermark, enum bch_data_type,
+ struct closure *);
static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
struct open_bucket *ob)
@@ -184,7 +190,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
wp->sectors_allocated += sectors;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
ptr.cached = cached ||
@@ -221,4 +227,9 @@ void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
+void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
+
+void bch2_print_allocator_stuck(struct bch_fs *);
+
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..9bbb28e90b93 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -9,11 +9,18 @@
#include "fifo.h"
struct bucket_alloc_state {
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
u64 buckets_seen;
u64 skipped_open;
u64 skipped_need_journal_commit;
u64 skipped_nocow;
u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
};
#define BCH_WATERMARKS() \
@@ -22,7 +29,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
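
(Editorial sketch, not part of the patch.) The new btree_bitmap field makes bucket selection a two-pass affair: btree allocations start at BTREE_BITMAP_YES, everything else at BTREE_BITMAP_NO, and when a pass finds nothing the allocator widens the state to BTREE_BITMAP_ANY and rescans (the "goto alloc" retry added earlier in this patch). A stand-alone illustration of how the tri-state gates a candidate bucket; in_btree_bitmap is a hypothetical stand-in for the member btree-bitmap test that feeds skipped_mi_btree_bitmap:

	/* Illustration only: how the tri-state might filter one candidate bucket. */
	static bool example_bucket_matches(const struct bucket_alloc_state *s,
					   bool in_btree_bitmap)
	{
		switch (s->btree_bitmap) {
		case BTREE_BITMAP_YES:		/* btree data: want bitmap-covered buckets */
			return in_btree_bitmap;
		case BTREE_BITMAP_NO:		/* non-btree data: leave those buckets alone */
			return !in_btree_bitmap;
		case BTREE_BITMAP_ANY:		/* fallback pass: accept anything */
			return true;
		}
		return true;
	}
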
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 8cb35ea572cb..4321f9fb73bd 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
#include "error.h"
#include <linux/mm.h>
@@ -22,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket2;
struct bch_backpointer bp2;
@@ -29,28 +31,46 @@ static bool extent_matches_bp(struct bch_fs *c,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
- &bucket2, &bp2);
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (!ca)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2);
if (bpos_eq(bucket, bucket2) &&
- !memcmp(&bp, &bp2, sizeof(bp)))
+ !memcmp(&bp, &bp2, sizeof(bp))) {
+ rcu_read_unlock();
return true;
+ }
}
+ rcu_read_unlock();
return false;
}
int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode);
+ if (!ca) {
+ /* these will be caught by fsck */
+ rcu_read_unlock();
+ return 0;
+ }
+
+ struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
+ struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
+ rcu_read_unlock();
int ret = 0;
- bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+ !bpos_eq(bp.k->p, bp_pos),
c, err,
- backpointer_pos_wrong,
- "backpointer at wrong pos");
+ backpointer_bucket_offset_wrong,
+ "backpointer bucket_offset wrong");
fsck_err:
return ret;
}
@@ -68,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- if (bch2_dev_exists2(c, k.k->p.inode)) {
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode);
+ if (ca) {
+ struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
+ rcu_read_unlock();
prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ bch2_bpos_to_text(out, bucket);
prt_str(out, " ");
+ } else {
+ rcu_read_unlock();
}
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
@@ -110,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
bch_err(c, "%s", buf.buf);
} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- prt_printf(&buf, "backpointer not found when deleting");
- prt_newline(&buf);
+ prt_printf(&buf, "backpointer not found when deleting\n");
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "searching for ");
@@ -138,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
@@ -154,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
return ret;
bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
bp_k->v = bp;
if (!insert) {
@@ -164,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bp_k->k.p,
- BTREE_ITER_INTENT|
- BTREE_ITER_SLOTS|
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_intent|
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates);
ret = bkey_err(k);
if (ret)
goto err;
@@ -190,13 +216,13 @@ err:
 * Find the next backpointer >= *bp_pos:
*/
int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket, int gen,
struct bpos *bp_pos,
struct bch_backpointer *bp,
unsigned iter_flags)
{
- struct bch_fs *c = trans->c;
- struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+ struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0);
struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
struct bkey_s_c k;
int ret = 0;
@@ -206,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans,
if (gen >= 0) {
k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED|iter_flags);
+ bucket, BTREE_ITER_cached|iter_flags);
ret = bkey_err(k);
if (ret)
goto out;
@@ -216,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans,
goto done;
}
- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0));
for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
*bp_pos, iter_flags, k, ret) {
@@ -242,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
/*
* If we're using the btree write buffer, the backpointer we were
@@ -252,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans,
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return;
+
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
@@ -281,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
{
if (likely(!bp.level)) {
struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
+
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return bkey_s_c_err(-EIO);
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0, 0,
iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
@@ -318,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
struct bch_backpointer bp)
{
struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct btree *b;
BUG_ON(!bp.level);
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return ERR_PTR(-EIO);
+
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0,
bp.level - 1,
0);
- b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(iter);
if (IS_ERR_OR_NULL(b))
goto err;
@@ -360,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
- backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
+ if (fsck_err(c, backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bp_pos_to_bucket(c, k.k->p), 0);
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
ret = bkey_err(alloc_k);
if (ret)
goto out;
@@ -378,7 +411,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
@@ -414,6 +447,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ if (!ca)
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ c, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
@@ -421,23 +532,32 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
- int ret;
+ int ret = 0;
bch2_bkey_buf_init(&tmp);
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
+ if (!ca) {
+ prt_str(&buf, "extent for nonexistent device:bucket ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
- return 0;
-
- if (!bch2_dev_bucket_exists(c, bucket))
- goto missing;
+ goto out;
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+ bucket_pos_to_bp(ca, bucket, bp.bucket_offset),
0);
ret = bkey_err(bp_k);
if (ret)
@@ -461,24 +581,98 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
- goto missing;
+
+ goto check_existing_bp;
}
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
+ printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ struct bkey_i_backpointer n_bp_k;
+ bkey_backpointer_init(&n_bp_k.k_i);
+ n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
+ n_bp_k.v = bp;
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+ ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
goto out;
}
@@ -496,14 +690,20 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
+ struct bpos bucket_pos = POS_MIN;
struct bch_backpointer bp;
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level,
- k, p, &bucket_pos, &bp);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (ca)
+ bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
@@ -596,7 +796,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
__for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ 0, depth, BTREE_ITER_prefetch, b, ret) {
mem_may_pin -= btree_buf_bytes(b);
if (mem_may_pin <= 0) {
c->btree_cache.pinned_nodes_end = *end =
@@ -630,31 +830,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
while (level >= depth) {
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- level,
- BTREE_ITER_PREFETCH);
- while (1) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
- ret = bkey_err(k) ?:
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret)
- break;
- if (bpos_eq(iter.pos, SPOS_MAX))
- break;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }));
if (ret)
return ret;
@@ -772,7 +954,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bpos last_flushed_pos = SPOS_MAX;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_PREFETCH, k,
+ POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9feac..6021de1c5e98 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "super.h"
static inline u64 swab40(u64 x)
@@ -18,7 +19,7 @@ static inline u64 swab40(u64 x)
}
int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);
@@ -36,50 +37,67 @@ void bch2_backpointer_swab(struct bkey_s);
* Convert from pos in backpointer btree to pos of corresponding bucket in alloc
* btree:
*/
-static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
- struct bpos bp_pos)
+static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode);
+ if (ca)
+ *bucket = bp_pos_to_bucket(ca, bp_pos);
+ rcu_read_unlock();
+ return ca != NULL;
+}
+
+static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket),
+ c, "backpointer for missing device %llu", bp_pos.inode);
+}
+
+static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ return POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+}
+
/*
* Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
*/
-static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
- struct bpos ret;
-
- ret = POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
+ struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
- struct bch_backpointer, struct bkey_s_c, bool);
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *,
+ struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
bool insert)
{
if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert);
struct bkey_i_backpointer bp_k;
bkey_backpointer_init(&bp_k.k_i);
- bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
bp_k.v = bp;
if (!insert) {
@@ -90,24 +108,44 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
{
- return level ? BCH_DATA_btree :
- p.has_ec ? BCH_DATA_stripe :
- BCH_DATA_user;
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
}
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
- *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
@@ -119,7 +157,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
};
}
-int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
struct bpos, struct bch_backpointer,
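
(Editorial note, not part of the patch.) With the device passed explicitly, the two position conversions above remain exact inverses as long as bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT stays below ca->mi.bucket_size — exactly the new bound enforced in bch2_backpointer_invalid(). A small sketch of the round trip:

	/* Illustration only: mirrors the EBUG_ON() inside bucket_pos_to_bp(). */
	static void example_bp_pos_round_trip(struct bch_dev *ca,
					      struct bpos bucket, u64 bucket_offset)
	{
		/* offset encodes (bucket start sector << shift) + offset within the bucket */
		struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);

		BUG_ON(bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT >= ca->mi.bucket_size);
		BUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, bp_pos)));
	}
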
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 799aa32b6b4d..2a538eb2af11 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,7 +209,7 @@
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "time_stats.h"
@@ -359,6 +359,8 @@ do { \
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
"Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_node_merging_disabled, \
+ "Disables merging of btree nodes") \
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
"Causes mark and sweep to compact and rewrite every " \
"btree node it traverses") \
@@ -455,7 +457,9 @@ enum bch_time_stats {
};
#include "alloc_types.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
@@ -467,6 +471,7 @@ enum bch_time_stats {
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
+#include "sb-members_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"
@@ -484,56 +489,19 @@ enum bch_time_stats {
struct btree;
-enum gc_phase {
- GC_PHASE_NOT_RUNNING,
- GC_PHASE_START,
- GC_PHASE_SB,
-
- GC_PHASE_BTREE_stripes,
- GC_PHASE_BTREE_extents,
- GC_PHASE_BTREE_inodes,
- GC_PHASE_BTREE_dirents,
- GC_PHASE_BTREE_xattrs,
- GC_PHASE_BTREE_alloc,
- GC_PHASE_BTREE_quotas,
- GC_PHASE_BTREE_reflink,
- GC_PHASE_BTREE_subvolumes,
- GC_PHASE_BTREE_snapshots,
- GC_PHASE_BTREE_lru,
- GC_PHASE_BTREE_freespace,
- GC_PHASE_BTREE_need_discard,
- GC_PHASE_BTREE_backpointers,
- GC_PHASE_BTREE_bucket_gens,
- GC_PHASE_BTREE_snapshot_trees,
- GC_PHASE_BTREE_deleted_inodes,
- GC_PHASE_BTREE_logged_ops,
- GC_PHASE_BTREE_rebalance_work,
- GC_PHASE_BTREE_subvolume_children,
-
- GC_PHASE_PENDING_DELETE,
-};
-
-struct gc_pos {
- enum gc_phase phase;
- struct bpos pos;
- unsigned level;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
struct bch_dev {
struct kobject kobj;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ atomic_long_t ref;
+ bool dying;
+ unsigned long last_put;
+#else
struct percpu_ref ref;
+#endif
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
@@ -559,14 +527,11 @@ struct bch_dev {
struct bch_devs_mask self;
- /* biosets used in cloned bios for writing multiple replicas */
- struct bio_set replica_set;
-
/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
* gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for ptr_stale():
+ * Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
@@ -580,7 +545,7 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- u64 alloc_cursor;
+ u64 alloc_cursor[3];
unsigned nr_open_buckets;
unsigned nr_btree_reserve;
@@ -614,6 +579,7 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
x(may_go_rw) \
x(rw) \
@@ -625,12 +591,12 @@ struct bch_dev {
x(clean_shutdown) \
x(fsck_running) \
x(initial_gc_unfixed) \
- x(need_another_gc) \
x(need_delete_dead_snapshots) \
x(error) \
x(topology_error) \
x(errors_fixed) \
- x(errors_not_fixed)
+ x(errors_not_fixed) \
+ x(no_invalid_checks)
enum bch_fs_flags {
#define x(n) BCH_FS_##n,
@@ -707,10 +673,13 @@ struct btree_trans_buf {
x(stripe_delete) \
x(reflink) \
x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
x(discard) \
x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
+ x(gc_gens) \
x(snapshot_delete_pagecache) \
x(sysfs) \
x(btree_write_buffer)
@@ -796,6 +765,7 @@ struct bch_fs {
u64 features;
u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -810,7 +780,6 @@ struct bch_fs {
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
- size_t snapshot_table_size;
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
@@ -922,7 +891,6 @@ struct bch_fs {
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
- struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
@@ -953,8 +921,7 @@ struct bch_fs {
struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
- struct task_struct *gc_thread;
- atomic_t kick_gc;
+ struct work_struct gc_gens_work;
unsigned long gc_count;
enum btree_id gc_gens_btree;
@@ -984,6 +951,7 @@ struct bch_fs {
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
+ struct bio_set replica_set;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct bucket_nocow_lock_table
@@ -1104,12 +1072,13 @@ struct bch_fs {
struct journal_keys journal_keys;
struct list_head journal_iters;
+ struct find_btree_nodes found_btree_nodes;
+
u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
- unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
@@ -1244,11 +1213,6 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now);
}
-static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
-{
- return dev < c->sb.nr_devices && c->devs[dev];
-}
-
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{
struct stdio_redirect *stdio = c->stdio;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index bff8750ac0d7..90c12fe2a2cd 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -76,6 +76,7 @@
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include <uapi/linux/magic.h>
#include "vstructs.h"
#ifdef __KERNEL__
@@ -502,16 +503,22 @@ struct bch_sb_field {
#include "alloc_background_format.h"
#include "extents_format.h"
-#include "reflink_format.h"
#include "ec_format.h"
-#include "inode_format.h"
#include "dirent_format.h"
-#include "xattr_format.h"
-#include "quota_format.h"
+#include "disk_groups_format.h"
+#include "inode_format.h"
+#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
+#include "quota_format.h"
+#include "reflink_format.h"
+#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
+#include "sb-downgrade_format.h"
+#include "sb-errors_format.h"
+#include "sb-members_format.h"
+#include "xattr_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -544,92 +551,6 @@ struct bch_sb_field_journal_v2 {
} d[];
};
-/* BCH_SB_FIELD_members_v1: */
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __le32 pad;
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
-};
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; //size of single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
/* BCH_SB_FIELD_crypt: */
struct nonce {
@@ -678,8 +599,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-/* BCH_SB_FIELD_replicas: */
-
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
@@ -722,50 +641,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
}
}
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[];
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[];
-} __packed;
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_disk_groups: */
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -793,43 +668,11 @@ struct bch_sb_field_clean {
__u64 _data[];
};
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
-};
-
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
+ __le64 btrees_lost_data;
};
/* Superblock: */
@@ -875,7 +718,8 @@ struct bch_sb_field_downgrade {
x(rebalance_work, BCH_VERSION(1, 3)) \
x(member_seq, BCH_VERSION(1, 4)) \
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6))
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -891,7 +735,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
#define BCH_SB_SECTOR 8
-#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+
+#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
struct bch_sb_layout {
__uuid_t magic; /* bcachefs superblock UUID */
@@ -1271,7 +1116,7 @@ enum bch_compression_opts {
UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
@@ -1313,7 +1158,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(write_buffer_keys, 11) \
x(datetime, 12)
-enum {
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -1359,7 +1204,7 @@ struct jset_entry_blacklist_v2 {
x(inodes, 1) \
x(key_version, 2)
-enum {
+enum bch_fs_usage_type {
#define x(f, nr) BCH_FS_USAGE_##f = nr,
BCH_FS_USAGE_TYPES()
#undef x
@@ -1500,7 +1345,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)| \
+ BIT_ULL(KEY_TYPE_error)) \
x(subvolumes, 8, 0, \
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
@@ -1534,6 +1380,26 @@ enum btree_id {
BTREE_ID_NR
};
+/*
+ * Maximum number of btrees that we will _ever_ have under the current scheme,
+ * where we refer to them with bitfields
+ */
+#define BTREE_ID_NR_MAX 64
+
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 76e79a15ba08..f46978e5cb7c 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -640,7 +640,7 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
int bch2_bkey_format_invalid(struct bch_fs *c,
struct bkey_format *f,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
unsigned i, bits = KEY_PACKED_BITS_START;
@@ -656,20 +656,17 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
* unpacked format:
*/
for (i = 0; i < f->nr_fields; i++) {
- if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) &&
+ bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 packed_max = f->bits_per_field[i]
? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
: 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max) {
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, field_offset, unpacked_max);
- return -BCH_ERR_invalid;
- }
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
+ return -BCH_ERR_invalid;
}
bits += f->bits_per_field[i];
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index cf23ff47bed8..fcd43915df07 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -9,10 +9,10 @@
#include "util.h"
#include "vstructs.h"
-enum bkey_invalid_flags {
- BKEY_INVALID_WRITE = (1U << 0),
- BKEY_INVALID_COMMIT = (1U << 1),
- BKEY_INVALID_JOURNAL = (1U << 2),
+enum bch_validate_flags {
+ BCH_VALIDATE_write = (1U << 0),
+ BCH_VALIDATE_commit = (1U << 1),
+ BCH_VALIDATE_journal = (1U << 2),
};
#if 0
@@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
}
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
+}
+
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
const struct bkey_packed *k)
{
@@ -568,8 +574,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+
+static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
+{
+ unsigned f_bits = f->bits_per_field[i];
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f_bits > unpacked_bits)
+ return true;
+
+ if ((f_bits == unpacked_bits) && field_offset)
+ return true;
+
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
+ return true;
+ return false;
+}
+
int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
#endif /* _BCACHEFS_BKEY_H */
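
(Editorial sketch, not part of the patch.) The overflow predicate is plain mask arithmetic: a packed field wider than the unpacked field always overflows, a full-width field with any offset overflows, and otherwise the field overflows when offset plus the largest packable value wraps past the unpacked maximum. With an illustrative 32-bit unpacked field and an 8-bit packed field:

	/*
	 * Illustration only: the same arithmetic as bch2_bkey_format_field_overflows(),
	 * with the widths passed in so the numbers are easy to check by hand.
	 */
	static bool example_field_overflows(unsigned unpacked_bits, unsigned f_bits,
					    u64 field_offset)
	{
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 f_mask = f_bits ? ~((~0ULL << (f_bits - 1)) << 1) : 0;

		return f_bits > unpacked_bits ||
			(f_bits == unpacked_bits && field_offset) ||
			((field_offset + f_mask) & unpacked_mask) < field_offset;
	}

	/*
	 * example_field_overflows(32, 8, 0xffffff80) is true:
	 * 0xffffff80 + 0xff = 0x10000007f, masked to 0x7f, which is < 0xffffff80,
	 * so the largest packed value would not survive unpacking.
	 */
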
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..c2c3dae52186 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -27,7 +27,7 @@ const char * const bch2_bkey_types[] = {
};
static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -41,7 +41,7 @@ static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
})
static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
int ret = 0;
@@ -58,7 +58,7 @@ fsck_err:
})
static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -82,7 +82,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
})
static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -123,9 +123,12 @@ const struct bkey_ops bch2_bkey_null_ops = {
};
int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
int ret = 0;
@@ -159,9 +162,12 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
int ret = 0;
bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
@@ -171,11 +177,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
+ (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0, c, err,
@@ -220,7 +230,7 @@ fsck_err:
int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return __bch2_bkey_invalid(c, k, type, flags, err) ?:
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 03efe8ee565a..726ef7483763 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -22,14 +22,15 @@ extern const struct bkey_ops bch2_bkey_null_ops;
*/
struct bkey_ops {
int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err);
+ enum bch_validate_flags flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -48,11 +49,11 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
}
int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
struct bkey_s_c, struct printbuf *);
@@ -76,56 +77,10 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-enum btree_update_flags {
- __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
- __BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
- __BTREE_TRIGGER_NORUN,
- __BTREE_TRIGGER_TRANSACTIONAL,
- __BTREE_TRIGGER_ATOMIC,
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-/* Don't run triggers at all */
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-/*
- * If set, we're running transactional triggers as part of a transaction commit:
- * triggers may generate new updates
- *
- * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set,
- * we're running atomic triggers during a transaction commit: we have our
- * journal reservation, we're holding btree node write locks, and we know the
- * transaction is going to commit (returning an error here is a fatal error,
- * causing us to go emergency read-only)
- */
-#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
-#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
-
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-
-/* @new is entering the btree */
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-
-/* @old is leaving the btree */
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-
-/* signal from bucket invalidate path to alloc trigger */
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-
static inline int bch2_key_trigger(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
@@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans,
}
static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans,
deleted.k.p = old.k->p;
return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_OVERWRITE|flags);
+ BTREE_TRIGGER_overwrite|flags);
}
static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans,
deleted.k.p = new.k->p;
return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ BTREE_TRIGGER_insert|flags);
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index bcca9e76a0b4..4536eb50fc40 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -6,9 +6,9 @@
#include "bset.h"
#include "extents.h"
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
+typedef int (*sort_cmp_fn)(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
static inline bool sort_iter_end(struct sort_iter *iter)
{
@@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
/*
* If keys compare equal, compare by pointer order:
*/
-static inline int key_sort_fix_overlapping_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
@@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
return nr;
}
-static inline int sort_keys_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (int) l->needs_whiteout - (int) r->needs_whiteout;
+ (long) l - (long) r;
}
-unsigned bch2_sort_keys(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
+#include "btree_update_interior.h"
+
+/*
+ * For sorting in the btree node write path: whiteouts not in the unwritten
+ * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
+ * dropped if overwritten by real keys:
+ */
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
{
- const struct bkey_format *f = &iter->b->format;
struct bkey_packed *in, *next, *out = dst;
- sort_iter_sort(iter, sort_keys_cmp);
+ sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
- while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- bool needs_whiteout = false;
+ while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
+ if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
+ continue;
- if (bkey_deleted(in) &&
- (filter_whiteouts || !in->needs_whiteout))
+ if ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
continue;
- while ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
- BUG_ON(in->needs_whiteout &&
- next->needs_whiteout);
- needs_whiteout |= in->needs_whiteout;
- in = sort_iter_next(iter, sort_keys_cmp);
- }
+ bkey_p_copy(out, in);
+ out = bkey_p_next(out);
+ }
- if (bkey_deleted(in)) {
- memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
- set_bkeyp_val_u64s(f, out, 0);
- } else {
- bkey_p_copy(out, in);
- }
- out->needs_whiteout |= needs_whiteout;
+ return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Main sort routine for compacting a btree node in memory: we always drop
+ * whiteouts because any whiteouts that need to be written are in the unwritten
+ * whiteouts area:
+ */
+unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
+
+ while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
+ if (bkey_deleted(in))
+ continue;
+
+ bkey_p_copy(out, in);
out = bkey_p_next(out);
}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
index 7c0f0b160f18..9be969d46890 100644
--- a/fs/bcachefs/bkey_sort.h
+++ b/fs/bcachefs/bkey_sort.h
@@ -48,7 +48,7 @@ bch2_sort_repack(struct bset *, struct btree *,
struct btree_node_iter *,
struct bkey_format *, bool);
-unsigned bch2_sort_keys(struct bkey_packed *,
- struct sort_iter *, bool);
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
+unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..575e1d0b6eeb 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
console_lock();
for_each_bset(b, t)
bch2_dump_bset(c, b, bset(b, t), t - b->set);
@@ -134,18 +132,23 @@ void bch2_dump_btree_node_iter(struct btree *b,
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
- struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
@@ -192,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
{
struct btree_node_iter_set *set, *s2;
struct bkey_packed *k, *p;
- struct bset_tree *t;
if (bch2_btree_node_iter_end(iter))
return;
@@ -207,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
/* Verify that set->end is correct: */
btree_node_iter_for_each(iter, set) {
for_each_bset(b, t)
- if (set->end == t->end_offset)
+ if (set->end == t->end_offset) {
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
goto found;
+ }
BUG();
found:
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
+ do {} while (0);
}
/* Verify iterator is sorted: */
@@ -371,11 +375,9 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(const struct btree *b)
+static void bset_aux_tree_verify(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- const struct bset_tree *t;
-
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -679,20 +681,20 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
}
-static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
@@ -1368,8 +1370,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
struct btree *b)
{
- struct bset_tree *t;
-
memset(iter, 0, sizeof(*iter));
for_each_bset(b, t)
@@ -1475,7 +1475,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
{
struct bkey_packed *k, *prev = NULL;
struct btree_node_iter_set *set;
- struct bset_tree *t;
unsigned end = 0;
if (bch2_expensive_debug_checks)
@@ -1544,9 +1543,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
- const struct bset_tree *t;
-
- for_each_bset(b, t) {
+ for_each_bset_c(b, t) {
enum bset_aux_tree_type type = bset_aux_tree_type(t);
size_t j;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa383..5c6c7a14fa0f 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree *b)
}
#define for_each_bset(_b, _t) \
- for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+ for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define for_each_bset_c(_b, _t) \
+ for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
@@ -294,7 +297,6 @@ static inline struct bset_tree *
bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
{
unsigned offset = __btree_node_key_to_offset(b, k);
- struct bset_tree *t;
for_each_bset(b, t)
if (offset <= t->end_offset) {
@@ -458,6 +460,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 562561a9a510..9e4ed75d3675 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -16,6 +16,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed_##counter++; \
+} while (0)
+
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
BTREE_FLAGS()
@@ -162,6 +168,9 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ --bc->used_by_btree[b->c.btree_id];
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -169,8 +178,11 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
BUG_ON(b->hash_val);
b->hash_val = btree_ptr_hash_val(&b->key);
- return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
+ int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+ if (!ret && b->c.btree_id < BTREE_ID_NR)
+ bc->used_by_btree[b->c.btree_id]++;
+ return ret;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
@@ -190,6 +202,35 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
return ret;
}
+void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
@@ -203,7 +244,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
@@ -225,38 +266,64 @@ wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b) ||
- btree_node_write_blocked(b) ||
- btree_node_will_make_reachable(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
+ goto out_unlock;
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
+ goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush)
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -286,14 +353,14 @@ out_unlock_intent:
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -341,11 +408,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, true)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->freed++;
}
}
restart:
@@ -354,9 +422,11 @@ restart:
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- } else if (!btree_node_reclaim(c, b)) {
+ bc->not_freed_access_bit++;
+ } else if (!btree_node_reclaim(c, b, true)) {
freed++;
btree_node_data_free(c, b);
+ bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
@@ -564,7 +634,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
+ if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
@@ -600,7 +670,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, false)) {
list_del_init(&b->list);
goto got_node;
}
@@ -626,7 +696,7 @@ got_node:
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
+ if (!btree_node_reclaim(c, b2, false)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
btree_node_to_freedlist(bc, b2);
@@ -709,9 +779,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- u32 seq;
- BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
@@ -752,34 +844,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
set_btree_node_read_in_flight(b);
-
six_unlock_write(&b->c.lock);
- seq = six_lock_seq(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- /* Unlock before doing IO: */
- if (path && sync)
- bch2_trans_unlock_noassert(trans);
-
- bch2_btree_node_read(trans, b, sync);
+ if (path) {
+ u32 seq = six_lock_seq(&b->c.lock);
- if (!sync)
- return NULL;
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (path) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
+ bch2_btree_node_read(trans, b, sync);
- if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- BUG_ON(!path);
+ if (!sync)
+ return NULL;
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
}
return b;
@@ -808,7 +892,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
printbuf_exit(&buf);
}
@@ -831,7 +916,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
bool need_relock = false;
int ret;
@@ -951,7 +1035,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
{
struct bch_fs *c = trans->c;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -1028,7 +1111,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -1111,18 +1193,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_cache_find(bc, k);
+ struct btree *b = btree_cache_find(bc, k);
if (b)
return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- return PTR_ERR_OR_ZERO(b);
+ if (!IS_ERR_OR_NULL(b))
+ six_unlock_read(&b->c.lock);
+ return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1134,6 +1217,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+
+ BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
/* not allowed to wait on io with btree locks held: */
@@ -1145,6 +1230,8 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1159,7 +1246,7 @@ wait_on_io:
btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
mutex_unlock(&bc->lock);
-
+out:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -1220,9 +1307,39 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
+ const char *label, unsigned nr)
{
- prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ prt_printf(out, "%s\t", label);
+ prt_human_readable_u64(out, nr * c->opts.btree_node_size);
+ prt_printf(out, " (%u)\n", nr);
+}
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_btree_cache_line(out, c, "total:", bc->used);
+ prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
+ prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+
+ prt_newline(out);
+ prt_printf(out, "freed:\t%u\n", bc->freed);
+ prt_printf(out, "not freed:\n");
+ prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
+ prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
+ prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
+ prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
+ prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
+ prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
+ prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
+ prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
+ prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 6d33885fdbde..fed35de3e4de 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -17,6 +17,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *);
+
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
@@ -131,6 +134,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
const char *bch2_btree_id_str(enum btree_id);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index bdaed29f084a..dc97991bcd6a 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,11 +7,13 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
@@ -24,7 +26,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
@@ -40,6 +42,7 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -49,12 +52,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
}}};
}
-static bool should_restart_for_topology_repair(struct bch_fs *c)
-{
- return c->opts.fix_errors != FSCK_FIX_no &&
- !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
-}
-
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
preempt_disable();
@@ -66,94 +63,10 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
__gc_pos_set(c, new_pos);
}
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
- struct btree *b,
- struct bkey_buf *prev,
- struct bkey_buf cur,
- bool is_last)
-{
- struct bpos node_start = b->data->min_key;
- struct bpos node_end = b->data->max_key;
- struct bpos expected_start = bkey_deleted(&prev->k->k)
- ? node_start
- : bpos_successor(prev->k->k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- int ret = 0;
-
- if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
- if (!bpos_eq(expected_start, bp->v.min_key)) {
- bch2_topology_error(c);
-
- if (bkey_deleted(&prev->k->k)) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, node_start);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
- }
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
- }
-
- if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
- bch2_bpos_to_text(&buf2, node_end);
-
- if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
-
- bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@@ -178,40 +91,22 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
-static void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
+ struct bkey_i_btree_ptr_v2 *new;
int ret;
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
-
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
}
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +132,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -268,127 +174,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (!prev) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, b->data->min_key);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
- if (prev &&
- bpos_gt(expected_start, cur->data->min_key) &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
- /* cur overwrites prev: */
-
- if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
- cur->data->min_key), c,
- btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_PREV_NODE;
- goto out;
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
- bpos_predecessor(cur->data->min_key)), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- } else {
- /* prev overwrites cur: */
-
- if (mustfix_fsck_err_on(bpos_ge(expected_start,
- cur->data->max_key), c,
- btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_THIS_NODE;
- goto out;
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
- if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_min(c, cur, expected_start);
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ }
}
-out:
+err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
- struct btree *child)
+ struct btree *child, struct bpos *pulled_from_scan)
{
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
- bch2_bpos_to_text(&buf2, b->key.k.p);
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
- if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = set_node_max(c, child, b->key.k.p);
- if (ret)
- goto err;
+ prt_printf(&buf, "at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
}
err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
- bool have_child, dropped_children = false;
+ bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
-again:
- prev = NULL;
- have_child = dropped_children = false;
+
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -415,11 +332,17 @@ again:
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
+ cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- cur = NULL;
if (ret)
break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
continue;
}
@@ -427,7 +350,23 @@ again:
if (ret)
break;
- ret = btree_repair_node_boundaries(c, b, prev, cur);
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -445,6 +384,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@@ -452,8 +392,6 @@ again:
break;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -465,7 +403,11 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev);
+ ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
if (!IS_ERR_OR_NULL(prev))
@@ -479,6 +421,10 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
+
+ if (new_pass)
+ goto again;
+
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -495,7 +441,7 @@ again:
if (ret)
goto err;
- ret = bch2_btree_repair_topology_recurse(trans, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -503,7 +449,7 @@ again:
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- dropped_children = true;
+ new_pass = true;
}
if (ret)
@@ -530,12 +476,14 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- if (!ret && dropped_children)
+ if (!ret && new_pass)
goto again;
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -543,544 +491,193 @@ fsck_err:
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree *b;
- unsigned i;
+ struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- if (!r->alive)
- continue;
-
- b = r->b;
- if (btree_node_fake(b))
- continue;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b);
- six_unlock_read(&b->c.lock);
-
- if (ret == DROP_THIS_NODE) {
- bch_err(c, "empty btree root - repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- }
- }
-
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /*
- * XXX
- * use check_bucket_ref here
- */
- bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
-
- if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- set_bit(BCH_FS_need_another_gc, &c->flags);
- } else {
- do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
+ if (r->error) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- continue;
+ r->alive = false;
+ r->error = 0;
- if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) != data_type, c,
- ptr_bucket_data_type_mismatch,
- "bucket %u:%zu different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->data_type = data_type;
- set_bit(BCH_FS_need_another_gc, &c->flags);
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
- do_update = true;
+ bch2_btree_root_alloc_fake_trans(trans, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
}
- }
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
+ reconstructed_root = true;
}
- }
- if (do_update) {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct bkey_i *new;
+ struct btree *b = r->b;
- if (is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
- new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
- if (!new) {
- ret = -BCH_ERR_ENOMEM_gc_repair_key;
- bch_err_msg(c, ret, "allocating new key");
- goto err;
- }
-
- bkey_reassemble(new, *k);
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- } else {
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->gen) < 0) ||
- gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type);
- }));
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
+ six_unlock_read(&b->c.lock);
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
- }
+ if (ret == DROP_THIS_NODE) {
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
- if (level)
- bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+ r->b = NULL;
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, *k);
- bch_info(c, "updated %s", buf.buf);
+ if (!reconstructed_root)
+ goto reconstruct_root;
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
+ bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
+ r->alive = false;
+ ret = 0;
}
-
- *k = bkey_i_to_s_c(new);
}
-err:
fsck_err:
- printbuf_exit(&buf);
+ bch2_trans_put(trans);
return ret;
}
/* marking of btree keys/nodes: */
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k,
+ unsigned level, struct btree **prev,
+ struct btree_iter *iter, struct bkey_s_c k,
bool initial)
{
struct bch_fs *c = trans->c;
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- int ret = 0;
-
- deleted.p = k->k->p;
-
- if (initial) {
- BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > atomic64_read(&c->journal.seq));
-
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
-
- if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
- bkey_version_in_future,
- "key version number higher than recorded: %llu > %llu",
- k->k->version.lo,
- atomic64_read(&c->key_version)))
- atomic64_set(&c->key_version, k->k->version.lo);
- }
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
-fsck_err:
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
-{
- struct bch_fs *c = trans->c;
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- struct bkey_buf prev, cur;
- int ret = 0;
- if (!btree_node_type_needs_gc(btree_node_type(b)))
- return 0;
-
- bch2_btree_node_iter_init_from_start(&iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
+ if (iter) {
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree *b = path_l(path)->b;
- while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, initial);
- if (ret)
- break;
-
- bch2_btree_node_iter_advance(&iter, b);
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_gc_check_topology(c, b, &prev, cur,
- bch2_btree_node_iter_end(&iter));
+ if (*prev != b) {
+ int ret = bch2_btree_node_check_topology(trans, b);
if (ret)
- break;
+ return ret;
}
+ *prev = b;
}
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
- bool initial, bool metadata_only)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- unsigned depth = metadata_only ? 1 : 0;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
-
- __for_each_btree_node(trans, iter, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b, ret) {
- bch2_verify_btree_nr_keys(b);
+ deleted.p = k.k->p;
- gc_pos_set(c, gc_pos_btree_node(b));
+ if (initial) {
+ BUG_ON(bch2_journal_seq_verify &&
+ k.k->version.lo > atomic64_read(&c->journal.seq));
- ret = btree_gc_mark_node(trans, b, initial);
- if (ret)
- break;
+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+ bkey_version_in_future,
+ "key version number higher than recorded %llu\n %s",
+ atomic64_read(&c->key_version),
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ atomic64_set(&c->key_version, k.k->version.lo);
}
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- return ret;
-
- mutex_lock(&c->btree_root_lock);
- b = bch2_btree_id_root(c, btree_id)->b;
- if (!btree_node_fake(b)) {
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- true, &k, initial);
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
+ c, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
- gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
- mutex_unlock(&c->btree_root_lock);
- return ret;
-}
-
-static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
- unsigned target_depth)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur, prev;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
- false, &k, true);
- if (ret)
- goto fsck_err;
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
- k = bkey_i_to_s_c(cur.k);
+ /*
+ * We require a commit before key_trigger() because
+ * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
+ * wrong result if we run it multiple times.
+ */
+ unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
- bch2_btree_and_journal_iter_advance(&iter);
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_check_repair|flags);
+ if (ret)
+ goto out;
- ret = bch2_gc_check_topology(c, b,
- &prev, cur,
- !bch2_btree_and_journal_iter_peek(&iter).k);
- if (ret)
- goto fsck_err;
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
- }
+ if (trans->nr_updates) {
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
- if (b->c.level > target_depth) {
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- struct btree *child;
-
- bch2_bkey_buf_reassemble(&cur, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- child = bch2_btree_node_get_noiter(trans, cur.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(child);
-
- if (bch2_err_matches(ret, EIO)) {
- bch2_topology_error(c);
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_read_error,
- "Unreadable btree node at btree %s level %u:\n"
- " %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level - 1,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto fsck_err;
- } else {
- /* Continue marking when opted to not
- * fix the error: */
- ret = 0;
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- continue;
- }
- } else if (ret) {
- bch_err_msg(c, ret, "getting btree node");
- break;
- }
-
- ret = bch2_gc_btree_init_recurse(trans, child,
- target_depth);
- six_unlock_read(&child->c.lock);
-
- if (ret)
- break;
- }
- }
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_gc|flags);
+out:
fsck_err:
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-static int bch2_gc_btree_init(struct btree_trans *trans,
- enum btree_id btree_id,
- bool metadata_only)
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
- struct btree *b;
- unsigned target_depth = metadata_only ? 1 : 0;
- struct printbuf buf = PRINTBUF;
+ int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
- b = bch2_btree_id_root(c, btree_id)->b;
-
- if (btree_node_fake(b))
- return 0;
-
- six_lock_read(&b->c.lock, NULL, NULL);
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->min_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
- btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
- }
+ /* We need to make sure every leaf node is readable before going RW */
+ if (initial)
+ target_depth = 0;
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->max_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
- btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
+ /* root */
+ mutex_lock(&c->btree_root_lock);
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
+ if (!btree_node_fake(b)) {
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
+ ret = lockrestart_do(trans,
+ bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ NULL, NULL, bkey_i_to_s_c(&b->key), initial));
+ level = b->c.level;
}
+ mutex_unlock(&c->btree_root_lock);
- if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
+ if (ret)
+ return ret;
- if (!ret) {
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ for (; level >= target_depth; --level) {
+ struct btree *prev = NULL;
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
- &k, true);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
+ bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
+ }));
+ if (ret)
+ break;
}
-fsck_err:
- six_unlock_read(&b->c.lock);
- bch_err_fn(c, ret);
- printbuf_exit(&buf);
return ret;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
- return (int) btree_id_to_gc_phase(l) -
- (int) btree_id_to_gc_phase(r);
+ return cmp_int(gc_btree_order(l), gc_btree_order(r));
}
-static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
@@ -1091,98 +688,36 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- for (i = 0; i < BTREE_ID_NR && !ret; i++)
- ret = initial
- ? bch2_gc_btree_init(trans, ids[i], metadata_only)
- : bch2_gc_btree(trans, ids[i], initial, metadata_only);
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
- for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
- if (!bch2_btree_id_root(c, i)->alive)
+ if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = initial
- ? bch2_gc_btree_init(trans, i, metadata_only)
- : bch2_gc_btree(trans, i, initial, metadata_only);
- }
+ ret = bch2_gc_btree(trans, btree, true);
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ c, btree_node_read_error,
+ "btree node read error for %s",
+ bch2_btree_id_str(btree)))
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ }
+fsck_err:
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
-static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- unsigned flags)
-{
- u64 b = sector_to_bucket(ca, start);
-
- do {
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- gc_phase(GC_PHASE_SB), flags);
- b++;
- start += sectors;
- } while (start < end);
-}
-
-static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- unsigned i;
- u64 b;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR)
- mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
- BCH_DATA_sb, flags);
-
- mark_metadata_sectors(c, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, flags);
- }
-
- for (i = 0; i < ca->journal.nr; i++) {
- b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), flags);
- }
-}
-
-static void bch2_mark_superblocks(struct bch_fs *c)
+static int bch2_mark_superblocks(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB));
+ gc_pos_set(c, gc_phase(GC_PHASE_sb));
- for_each_online_member(c, ca)
- bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
+ int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
mutex_unlock(&c->sb_lock);
+ return ret;
}
-#if 0
-/* Also see bch2_pending_btree_node_free_insert_done() */
-static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
-{
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
-
- for_each_pending_btree_node_free(c, as, d)
- if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-#endif
-
static void bch2_gc_free(struct bch_fs *c)
{
genradix_free(&c->reflink_gc_table);
@@ -1200,28 +735,23 @@ static void bch2_gc_free(struct bch_fs *c)
c->usage_gc = NULL;
}
-static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
+static int bch2_gc_done(struct bch_fs *c)
{
struct bch_dev *ca = NULL;
struct printbuf buf = PRINTBUF;
- bool verify = !metadata_only &&
- !c->opts.reconstruct_alloc &&
- (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i;
int ret = 0;
percpu_down_write(&c->mark_lock);
-#define copy_field(_err, _f, _msg, ...) \
- if (dst->_f != src->_f && \
- (!verify || \
- fsck_err(c, _err, _msg ": got %llu, should be %llu" \
- , ##__VA_ARGS__, dst->_f, src->_f))) \
+#define copy_field(_err, _f, _msg, ...) \
+ if (fsck_err_on(dst->_f != src->_f, c, _err, \
+ _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
+ dst->_f, src->_f)) \
dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...) \
+#define copy_dev_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...) \
+#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
@@ -1254,31 +784,24 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(fs_usage_btree_wrong,
b.btree, "btree");
- if (!metadata_only) {
- copy_fs_field(fs_usage_data_wrong,
- b.data, "data");
- copy_fs_field(fs_usage_cached_wrong,
- b.cached, "cached");
- copy_fs_field(fs_usage_reserved_wrong,
- b.reserved, "reserved");
- copy_fs_field(fs_usage_nr_inodes_wrong,
- b.nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(fs_usage_persistent_reserved_wrong,
- persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
+ copy_fs_field(fs_usage_data_wrong,
+ b.data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ b.cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ b.reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+ b.nr_inodes,"nr_inodes");
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
+ "persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, e);
@@ -1292,10 +815,8 @@ static int bch2_gc_done(struct bch_fs *c,
#undef copy_stripe_field
#undef copy_field
fsck_err:
- if (ca)
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
-
percpu_up_write(&c->mark_lock);
printbuf_exit(&buf);
return ret;
@@ -1318,7 +839,7 @@ static int bch2_gc_start(struct bch_fs *c)
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return -BCH_ERR_ENOMEM_gc_start;
}
@@ -1329,19 +850,6 @@ static int bch2_gc_start(struct bch_fs *c)
return 0;
}
-static int bch2_gc_reset(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
- }
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-
- return bch2_gc_start(c);
-}
-
/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
struct bch_alloc_v4 r)
@@ -1357,56 +865,41 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
static int bch2_alloc_write_key(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c k,
- bool metadata_only)
+ struct bch_dev *ca,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_convert, new;
+ struct bch_alloc_v4 old_gc, gc, old_convert, new;
const struct bch_alloc_v4 *old;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
- new = *old;
+ gc = new = *old;
percpu_down_read(&c->mark_lock);
- b = gc_bucket(ca, iter->pos.offset);
- old_gc = *b;
+ __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
+
+ old_gc = gc;
if ((old->data_type == BCH_DATA_sb ||
old->data_type == BCH_DATA_journal) &&
!bch2_dev_is_online(ca)) {
- b->data_type = old->data_type;
- b->dirty_sectors = old->dirty_sectors;
+ gc.data_type = old->data_type;
+ gc.dirty_sectors = old->dirty_sectors;
}
/*
- * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * gc.data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- b->data_type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- gc = *b;
+ alloc_data_type_set(&gc, gc.data_type);
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors)
- bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
+ bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
percpu_up_read(&c->mark_lock);
- if (metadata_only &&
- gc.data_type != BCH_DATA_sb &&
- gc.data_type != BCH_DATA_journal &&
- gc.data_type != BCH_DATA_btree)
- return 0;
-
- if (gen_after(old->gen, gc.gen))
- return 0;
-
if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
@@ -1456,12 +949,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
return ret;
}
-static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_done(struct bch_fs *c)
{
int ret = 0;
@@ -1470,11 +963,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_slots|BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ bch2_alloc_write_key(trans, &iter, ca, k)));
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
break;
}
}
@@ -1483,14 +976,14 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
return ret;
}
-static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_start(struct bch_fs *c)
{
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err(c, "error allocating ca->buckets[gc]");
return -BCH_ERR_ENOMEM_gc_alloc_start;
}
@@ -1500,54 +993,29 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
+ struct bch_dev *ca = NULL;
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
g->gen_valid = 1;
g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
-
0;
})));
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
-{
- for_each_member_device(c, ca) {
- struct bucket_array *buckets = gc_bucket_array(ca);
- struct bucket *g;
-
- for_each_bucket(g, buckets) {
- if (metadata_only &&
- (g->data_type == BCH_DATA_user ||
- g->data_type == BCH_DATA_cached ||
- g->data_type == BCH_DATA_parity))
- continue;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- }
- }
-}
-
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1583,7 +1051,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
- return ret;
+ goto out;
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
@@ -1591,40 +1059,33 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
ret = bch2_trans_update(trans, iter, new, 0);
}
+out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
-static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_reflink_done(struct bch_fs *c)
{
size_t idx = 0;
- if (metadata_only)
- return 0;
-
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
c->reflink_gc_nr = 0;
return ret;
}
-static int bch2_gc_reflink_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_reflink_start(struct bch_fs *c)
{
-
- if (metadata_only)
- return 0;
-
c->reflink_gc_nr = 0;
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
const __le64 *refcount = bkey_refcount_c(k);
if (!refcount)
@@ -1647,15 +1108,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
return ret;
}
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
-{
- struct genradix_iter iter;
- struct reflink_gc *r;
-
- genradix_for_each(&c->reflink_gc_table, iter, r)
- r->refcount = 0;
-}
-
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@@ -1709,30 +1161,20 @@ fsck_err:
return ret;
}
-static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_stripes_done(struct bch_fs *c)
{
- if (metadata_only)
- return 0;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_stripes_key(trans, &iter, k)));
}
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
-{
- genradix_free(&c->gc_stripes);
-}
-
/**
- * bch2_gc - walk _all_ references to buckets, and recompute them:
+ * bch2_check_allocations - walk all references to buckets, and recompute them:
*
* @c: filesystem object
- * @initial: are we in recovery?
- * @metadata_only: are we just checking metadata references, or everything?
*
* Returns: 0 on success, or standard errcode on failure
*
@@ -1751,9 +1193,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+int bch2_check_allocations(struct bch_fs *c)
{
- unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1763,66 +1204,34 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
bch2_btree_interior_updates_flush(c);
ret = bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c, metadata_only) ?:
- bch2_gc_reflink_start(c, metadata_only);
+ bch2_gc_alloc_start(c) ?:
+ bch2_gc_reflink_start(c);
if (ret)
goto out;
-again:
- gc_pos_set(c, gc_phase(GC_PHASE_START));
- bch2_mark_superblocks(c);
+ gc_pos_set(c, gc_phase(GC_PHASE_start));
- ret = bch2_gc_btrees(c, initial, metadata_only);
+ ret = bch2_mark_superblocks(c);
+ BUG_ON(ret);
+ ret = bch2_gc_btrees(c);
if (ret)
goto out;
-#if 0
- bch2_mark_pending_btree_node_frees(c);
-#endif
c->gc_count++;
- if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
- (!iter && bch2_test_restart_gc)) {
- if (iter++ > 2) {
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
- goto out;
- }
-
- /*
- * XXX: make sure gens we fixed got saved
- */
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_need_another_gc, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
- bch2_gc_stripes_reset(c, metadata_only);
- bch2_gc_alloc_reset(c, metadata_only);
- bch2_gc_reflink_reset(c, metadata_only);
- ret = bch2_gc_reset(c);
- if (ret)
- goto out;
-
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
- goto again;
- }
+ bch2_journal_block(&c->journal);
out:
- if (!ret) {
- bch2_journal_block(&c->journal);
+ ret = bch2_gc_alloc_done(c) ?:
+ bch2_gc_done(c) ?:
+ bch2_gc_stripes_done(c) ?:
+ bch2_gc_reflink_done(c);
- ret = bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only);
-
- bch2_journal_unblock(&c->journal);
- }
+ bch2_journal_unblock(&c->journal);
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
@@ -1847,23 +1256,33 @@ static int gc_btree_gens_key(struct btree_trans *trans,
struct bkey_i *u;
int ret;
+ if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
+ return -EROFS;
+
percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (ptr_stale(ca, ptr) > 16) {
+ if (dev_ptr_stale(ca, ptr) > 16) {
+ rcu_read_unlock();
percpu_up_read(&c->mark_lock);
goto update;
}
}
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
if (gen_after(*gen, ptr->gen))
*gen = ptr->gen;
}
+ rcu_read_unlock();
percpu_up_read(&c->mark_lock);
return 0;
update:
@@ -1876,10 +1295,9 @@ update:
return 0;
}
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
struct bkey_i_alloc_v4 *a_mut;
@@ -1894,7 +1312,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
return ret;
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
- a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
+ alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
@@ -1922,7 +1340,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
@@ -1940,7 +1358,7 @@ int bch2_gc_gens(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, i,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -1949,14 +1367,23 @@ int bch2_gc_gens(struct bch_fs *c)
goto err;
}
+ struct bch_dev *ca = NULL;
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN,
- BTREE_ITER_PREFETCH,
+ BTREE_ITER_prefetch,
k,
NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_oldest_gen(trans, &iter, k)));
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
+ })));
+ bch2_dev_put(ca);
+
if (ret)
goto err;
@@ -1980,87 +1407,23 @@ err:
return ret;
}
-static int bch2_gc_thread(void *arg)
+static void bch2_gc_gens_work(struct work_struct *work)
{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic64_read(&clock->now);
- unsigned last_kick = atomic_read(&c->kick_gc);
-
- set_freezable();
-
- while (1) {
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
- if (atomic_read(&c->kick_gc) != last_kick)
- break;
-
- if (c->btree_gc_periodic) {
- unsigned long next = last + c->capacity / 16;
-
- if (atomic64_read(&clock->now) >= next)
- break;
-
- bch2_io_clock_schedule_timeout(clock, next);
- } else {
- schedule();
- }
-
- try_to_freeze();
- }
- __set_current_state(TASK_RUNNING);
-
- last = atomic64_read(&clock->now);
- last_kick = atomic_read(&c->kick_gc);
-
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- ret = bch2_gc(c, false, false);
-#else
- bch2_gc_gens(c);
-#endif
- debug_check_no_locks_held();
- }
-
- return 0;
+ struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
+ bch2_gc_gens(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-void bch2_gc_thread_stop(struct bch_fs *c)
+void bch2_gc_gens_async(struct bch_fs *c)
{
- struct task_struct *p;
-
- p = c->gc_thread;
- c->gc_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ !queue_work(c->write_ref_wq, &c->gc_gens_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-int bch2_gc_thread_start(struct bch_fs *c)
+void bch2_fs_gc_init(struct bch_fs *c)
{
- struct task_struct *p;
-
- if (c->gc_thread)
- return 0;
+ seqcount_init(&c->gc_pos_lock);
- p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
- if (IS_ERR(p)) {
- bch_err_fn(c, PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- c->gc_thread = p;
- wake_up_process(p);
- return 0;
+ INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 607575f83a00..876d81e2017d 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -3,13 +3,11 @@
#define _BCACHEFS_BTREE_GC_H
#include "bkey.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
-int bch2_gc(struct bch_fs *, bool, bool);
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_thread_stop(struct bch_fs *);
-int bch2_gc_thread_start(struct bch_fs *);
+int bch2_check_allocations(struct bch_fs *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@@ -35,38 +33,17 @@ int bch2_gc_thread_start(struct bch_fs *);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
- return (struct gc_pos) {
- .phase = phase,
- .pos = POS_MIN,
- .level = 0,
- };
+ return (struct gc_pos) { .phase = phase, };
}
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- bpos_cmp(l.pos, r.pos) ?:
- cmp_int(l.level, r.level);
-}
-
-static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
-{
- switch (id) {
-#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
- BCH_BTREE_IDS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id id,
- struct bpos pos, unsigned level)
+static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
+ struct bpos pos)
{
return (struct gc_pos) {
- .phase = btree_id_to_gc_phase(id),
- .pos = pos,
+ .phase = GC_PHASE_btree,
+ .btree = btree,
.level = level,
+ .pos = pos,
};
}
@@ -76,19 +53,23 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id,
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
- return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
+ return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}
-/*
- * GC position of the pointer to a btree root: we don't use
- * gc_pos_pointer_to_btree_node() here to avoid a potential race with
- * btree_split() increasing the tree depth - the new root will have level > the
- * old root and thus have a greater gc position than the old root, but that
- * would be incorrect since once gc has marked the root it's not coming back.
- */
-static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+static inline int gc_btree_order(enum btree_id btree)
+{
+ if (btree == BTREE_ID_stripes)
+ return -1;
+ return btree;
+}
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+ return cmp_int(l.phase, r.phase) ?:
+ cmp_int(gc_btree_order(l.btree),
+ gc_btree_order(r.btree)) ?:
+ -cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
@@ -104,11 +85,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
-static inline void bch2_do_gc_gens(struct bch_fs *c)
-{
- atomic_inc(&c->kick_gc);
- if (c->gc_thread)
- wake_up_process(c->gc_thread);
-}
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_gens_async(struct bch_fs *);
+void bch2_fs_gc_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
new file mode 100644
index 000000000000..b82c24bcc088
--- /dev/null
+++ b/fs/bcachefs/btree_gc_types.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_TYPES_H
+#define _BCACHEFS_BTREE_GC_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+enum gc_phase {
+ GC_PHASE_not_running,
+ GC_PHASE_start,
+ GC_PHASE_sb,
+ GC_PHASE_btree,
+};
+
+struct gc_pos {
+ enum gc_phase phase:8;
+ enum btree_id btree:8;
+ u16 level;
+ struct bpos pos;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34df8ccc5fec..829c1b91477d 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -23,6 +23,18 @@
#include <linux/sched/mm.h>
+static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
+{
+ prt_printf(out, "btree=%s l=%u seq %llux\n",
+ bch2_btree_id_str(BTREE_NODE_ID(bn)),
+ (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
+ prt_str(out, "min: ");
+ bch2_bpos_to_text(out, bn->min_key);
+ prt_newline(out);
+ prt_str(out, "max: ");
+ bch2_bpos_to_text(out, bn->max_key);
+}
+
void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
@@ -217,7 +229,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t,
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
- struct bset_tree *t;
bool ret = false;
for_each_bset(b, t) {
@@ -288,8 +299,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
static void btree_node_sort(struct bch_fs *c, struct btree *b,
unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
+ unsigned end_idx)
{
struct btree_node *out;
struct sort_iter_stack sort_iter;
@@ -320,7 +330,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
out->keys.u64s = cpu_to_le16(u64s);
@@ -426,13 +436,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
break;
if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx,
- b->nsets, false);
+ btree_node_sort(c, b, unwritten_idx, b->nsets);
ret = true;
}
if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx, false);
+ btree_node_sort(c, b, 0, unwritten_idx);
ret = true;
}
@@ -441,8 +450,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
void bch2_btree_build_aux_trees(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
!bset_written(b, bset(b, t)) &&
@@ -512,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
- struct btree *b, struct bset *i,
+ struct btree *b, struct bset *i, struct bkey_packed *k,
unsigned offset, int write)
{
prt_printf(out, bch2_log_msg(c, "%s"),
@@ -524,28 +531,36 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u/%u",
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "\nnode offset %u/%u",
b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ if (k)
+ prt_printf(out, " bset byte offset %lu",
+ (unsigned long)(void *)k -
+ ((unsigned long)(void *)i & ~511UL));
prt_str(out, ": ");
}
-__printf(9, 10)
+__printf(10, 11)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
+ struct bkey_packed *k,
int write,
bool have_retry,
enum bch_sb_error_id err_type,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
+ bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
va_list args;
- btree_err_msg(&out, c, ca, b, i, b->written, write);
+ btree_err_msg(&out, c, ca, b, i, k, b->written, write);
va_start(args, fmt);
prt_vprintf(&out, fmt, args);
@@ -564,12 +579,14 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
- if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
bch2_sb_error_count(c, err_type);
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ ret = !silent
+ ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
+ : -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
goto fsck_err;
@@ -577,14 +594,17 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_want_retry:
case -BCH_ERR_btree_node_read_err_must_retry:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
break;
case -BCH_ERR_btree_node_read_err_bad_node:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = -BCH_ERR_fsck_errors_not_fixed;
break;
default:
@@ -596,9 +616,9 @@ fsck_err:
return ret;
}
-#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
+#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
BCH_FSCK_ERR_##_err_type, \
msg, ##__VA_ARGS__); \
\
@@ -619,8 +639,6 @@ fsck_err:
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k;
@@ -654,6 +672,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
@@ -676,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_version_compatible(version),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
@@ -684,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
@@ -697,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
@@ -709,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node")) {
i->u64s = 0;
@@ -725,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_empty,
"empty bset");
btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_wrong_sector_offset,
"bset at wrong sector offset");
@@ -747,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_level,
"incorrect level");
@@ -779,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
@@ -790,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
@@ -802,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_format,
"invalid bkey format: %s\n %s", buf1.buf,
(printbuf_reset(&buf2),
@@ -830,7 +849,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
@@ -839,7 +858,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
- if (k->u64s < bkeyp_key_u64s(&b->format, k))
+ if (!bkeyp_u64s_valid(&b->format, k))
return false;
struct printbuf buf = PRINTBUF;
@@ -869,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -878,16 +897,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format))
goto drop_this_key;
- if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_bad_u64s,
- "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
+ "bad k->u64s %u (min %u max %zu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;
if (!write)
@@ -905,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
goto drop_this_key;
@@ -926,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_to_text(&buf, u.k);
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_out_of_order,
"%s", buf.buf))
goto drop_this_key;
@@ -946,13 +967,12 @@ drop_this_key:
* do
*/
- if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
- if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
-
}
/*
@@ -996,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (bch2_meta_read_fault("btree"))
btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
@@ -1017,20 +1037,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
- "got wrong btree node (want %llx got %llx)\n"
- "got btree %s level %llu pos %s",
- bp->seq, b->data->keys.seq,
- bch2_btree_id_str(BTREE_NODE_ID(b->data)),
- BTREE_NODE_LEVEL(b->data),
- buf.buf);
+ "got wrong btree node: got\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
- "bad btree header: seq 0");
+ "bad btree header: seq 0\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
@@ -1044,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
@@ -1057,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
@@ -1072,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
@@ -1086,19 +1107,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (csum_bad)
+ if (ca && csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
@@ -1136,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(blacklisted && first,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
@@ -1162,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ptr_written) {
btree_err_on(b->written < ptr_written,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
@@ -1175,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(bne->keys.journal_seq),
true),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset");
}
@@ -1219,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bad_bkey,
"%s", buf.buf);
@@ -1247,12 +1268,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
+ rcu_read_lock();
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
- if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
+ rcu_read_unlock();
if (!ptr_written)
set_btree_node_need_rewrite(b);
@@ -1263,10 +1286,12 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else
+ } else {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ }
goto out;
}
@@ -1275,8 +1300,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
struct btree *b = rb->b;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
struct printbuf buf = PRINTBUF;
@@ -1288,8 +1313,8 @@ static void btree_node_read_work(struct work_struct *work)
while (1) {
retry = true;
bch_info(c, "retrying read");
- ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ rb->have_ioref = ca != NULL;
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1303,7 +1328,7 @@ static void btree_node_read_work(struct work_struct *work)
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
@@ -1327,6 +1352,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1335,7 +1361,9 @@ start:
rb->start_time);
bio_put(&rb->bio);
- if (saw_error && !btree_node_read_error(b)) {
+ if (saw_error &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
@@ -1356,7 +1384,7 @@ static void btree_node_read_endio(struct bio *bio)
struct bch_fs *c = rb->c;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1448,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
written2 = btree_node_sectors_written(c, ra->buf[i]);
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
@@ -1526,9 +1554,10 @@ fsck_err:
ret = -1;
}
- if (ret)
+ if (ret) {
set_btree_node_read_error(b);
- else if (*saw_error)
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
@@ -1552,7 +1581,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct btree_node_read_all *ra = rb->ra;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1594,14 +1623,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->ra = ra;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->idx = i;
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
@@ -1657,20 +1686,21 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err(c, "%s", buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1682,7 +1712,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
rb->b = b;
rb->ra = NULL;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
@@ -1837,7 +1867,6 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
- struct bch_extent_ptr *ptr;
int ret = 0;
btree_bounce_free(c,
@@ -1860,7 +1889,7 @@ static void btree_node_write_work(struct work_struct *work)
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
@@ -1887,13 +1916,14 @@ static void btree_node_write_endio(struct bio *bio)
struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
unsigned long flags;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ if (!ca ||
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
@@ -1960,7 +1990,6 @@ static void btree_write_submit(struct work_struct *work)
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
- struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
@@ -2086,11 +2115,11 @@ do_write:
unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
- b->whiteout_u64s = 0;
-
- u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
le16_add_cpu(&i->u64s, u64s);
+ b->whiteout_u64s = 0;
+
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
set_needs_whiteout(i, false);
@@ -2217,7 +2246,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
bool invalidated_iter = false;
struct btree_node_entry *bne;
- struct bset_tree *t;
if (!btree_node_just_written(b))
return false;
@@ -2240,7 +2268,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* single bset:
*/
if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets, true);
+ btree_node_sort(c, b, 0, b->nsets);
invalidated_iter = true;
} else {
invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
@@ -2337,20 +2365,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 10);
- prt_tab(out);
- prt_str(out, "nr");
- prt_tab(out);
- prt_str(out, "size");
- prt_newline(out);
+ prt_printf(out, "\tnr\tsize\n");
for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
- prt_printf(out, "%s:", bch2_btree_write_types[i]);
- prt_tab(out);
- prt_u64(out, nr);
- prt_tab(out);
+ prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
prt_newline(out);
}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e251cb6b965f..2b8b564fc560 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b,
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 51bcdc6c6d1c..d3bcb4e4e230 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l,
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_successor(p);
} else {
p = bpos_nosnap_successor(p);
@@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_predecessor(p);
} else {
p = bpos_nosnap_predecessor(p);
@@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
struct bpos pos = iter->pos;
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
!bkey_eq(pos, POS_MAX))
pos = bkey_successor(iter, pos);
return pos;
@@ -253,13 +253,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(iter->btree_id >= BTREE_ID_NR);
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
- BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
+ (iter->flags & BTREE_ITER_all_snapshots));
- BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
+ (iter->flags & BTREE_ITER_all_snapshots) &&
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
@@ -269,10 +269,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
@@ -289,7 +289,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
if (!bch2_debug_check_iterators)
return 0;
- if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
if (bkey_err(k) || !k.k)
@@ -300,8 +300,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
k.k->p.snapshot));
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_all_snapshots);
prev = bch2_btree_iter_prev(&copy);
if (!prev.k)
goto out;
@@ -332,6 +332,8 @@ out:
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache)
{
+ bch2_trans_verify_not_unlocked(trans);
+
struct btree_path *path;
struct trans_for_each_path_inorder_iter iter;
struct printbuf buf = PRINTBUF;
@@ -897,7 +899,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
bch2_bkey_buf_reassemble(out, c, k);
- if ((flags & BTREE_ITER_PREFETCH) &&
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
@@ -927,10 +929,24 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
- if ((flags & BTREE_ITER_PREFETCH) &&
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
@@ -962,7 +978,6 @@ err:
return ret;
}
-
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -986,6 +1001,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
+ trans->locked = true;
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1149,6 +1165,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
+ unsigned max_level = path->level;
EBUG_ON(btree_path_node(path, path->level) &&
!btree_node_locked(path, path->level));
@@ -1179,6 +1196,16 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
+
+ if (unlikely(max_level > path->level)) {
+ struct btree_path *linked;
+ unsigned iter;
+
+ trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
+ for (unsigned j = path->level + 1; j < max_level; j++)
+ linked->l[j] = path->l[j];
+ }
+
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
@@ -1208,11 +1235,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent)
+ bool intent, unsigned long ip)
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
__btree_path_get(trans->paths + new, intent);
+#ifdef TRACK_PATH_ALLOCATED
+ trans->paths[new].ip_allocated = ip;
+#endif
return new;
}
@@ -1221,7 +1251,7 @@ btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
__btree_path_put(trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent);
+ path = btree_path_clone(trans, path, intent, ip);
trans->paths[path].preserve = false;
return path;
}
@@ -1321,6 +1351,26 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t
__clear_bit(path, trans->paths_allocated);
}
+static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned l = path->level;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!is_btree_node(path, l))
+ return false;
+
+ if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
+ return false;
+
+ l++;
+ } while (l < path->locks_want);
+
+ return true;
+}
+
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
struct btree_path *path = trans->paths + path_idx, *dup;
@@ -1335,10 +1385,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
- if (path->should_be_locked &&
- !trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
- return;
+ if (path->should_be_locked && !trans->restarted) {
+ if (!dup)
+ return;
+
+ if (!(trans->locked
+ ? bch2_btree_path_relock_norestart(trans, dup)
+ : bch2_btree_path_can_relock(trans, dup)))
+ return;
+ }
if (dup) {
dup->preserve |= path->preserve;
@@ -1371,22 +1426,26 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
(void *) trans->last_restarted_ip);
}
+void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
+{
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+}
+
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- prt_printf(buf, "transaction updates for %s journal seq %llu",
+ prt_printf(buf, "transaction updates for %s journal seq %llu\n",
trans->fn, trans->journal_res.seq);
- prt_newline(buf);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- prt_printf(buf, "update: btree=%s cached=%u %pS",
+ prt_printf(buf, "update: btree=%s cached=%u %pS\n",
bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
- prt_newline(buf);
prt_printf(buf, " old ");
bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1415,23 +1474,63 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
printbuf_exit(&buf);
}
-static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
+ path->cached ? 'C' : 'B',
bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
- prt_printf(out, " locks %u", path->nodes_locked);
#ifdef TRACK_PATH_ALLOCATED
prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
+}
+
+static const char *btree_node_locked_str(enum btree_node_locked_type t)
+{
+ switch (t) {
+ case BTREE_NODE_UNLOCKED:
+ return "unlocked";
+ case BTREE_NODE_READ_LOCKED:
+ return "read";
+ case BTREE_NODE_INTENT_LOCKED:
+ return "intent";
+ case BTREE_NODE_WRITE_LOCKED:
+ return "write";
+ default:
+ return NULL;
+ }
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+{
+ bch2_btree_path_to_text_short(out, trans, path_idx);
+
+ struct btree_path *path = trans->paths + path_idx;
+
+ prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ prt_printf(out, "l=%u locks %s seq %u node ", l,
+ btree_node_locked_str(btree_node_locked_type(path, l)),
+ path->l[l].lock_seq);
+
+ int ret = PTR_ERR_OR_ZERO(path->l[l].b);
+ if (ret)
+ prt_str(out, bch2_err_str(ret));
+ else
+ prt_printf(out, "%px", path->l[l].b);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
}
static noinline __cold
@@ -1443,8 +1542,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_idx_inorder(trans, iter)
- bch2_btree_path_to_text(out, trans, iter.path_idx);
+ trans_for_each_path_idx_inorder(trans, iter) {
+ bch2_btree_path_to_text_short(out, trans, iter.path_idx);
+ prt_newline(out);
+ }
}
noinline __cold
@@ -1595,11 +1696,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
unsigned flags, unsigned long ip)
{
struct btree_path *path;
- bool cached = flags & BTREE_ITER_CACHED;
- bool intent = flags & BTREE_ITER_INTENT;
+ bool cached = flags & BTREE_ITER_cached;
+ bool intent = flags & BTREE_ITER_intent;
struct trans_for_each_path_inorder_iter iter;
btree_path_idx_t path_pos = 0, path_idx;
+ bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
bch2_trans_verify_locks(trans);
@@ -1644,7 +1746,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths_sorted = false;
}
- if (!(flags & BTREE_ITER_NOPRESERVE))
+ if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
if (path->intent_ref)
@@ -1665,6 +1767,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
return path_idx;
}
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_intent, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path_idx;
+}
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
@@ -1706,6 +1824,19 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
+
+void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
+}
/* Btree iterators: */
int __must_check
@@ -1720,9 +1851,11 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
+
iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
@@ -1761,7 +1894,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out:
@@ -1822,13 +1955,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (bpos_eq(iter->pos, b->key.k.p)) {
__btree_path_set_level_up(trans, path, path->level++);
} else {
+ if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, path->level + 1);
+
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
iter->path = bch2_btree_path_set_pos(trans, iter->path,
bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
path = btree_iter_path(trans, iter);
@@ -1846,7 +1982,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
@@ -1865,11 +2001,11 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, SPOS_MAX)
: bkey_eq(pos, SPOS_MAX));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -1878,11 +2014,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, POS_MIN)
: bkey_eq(pos, POS_MIN));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -1993,7 +2129,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
struct bkey_s_c k;
int ret;
- if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked(trans);
+
+ if ((iter->flags & BTREE_ITER_key_cache_fill) &&
bpos_eq(iter->pos, pos))
return bkey_s_c_null;
@@ -2002,17 +2141,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (!iter->key_cache_path)
iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_INTENT, 0,
- iter->flags|BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL,
+ iter->flags & BTREE_ITER_intent, 0,
+ iter->flags|BTREE_ITER_cached|
+ BTREE_ITER_cached_nofill,
_THIS_IP_);
iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_CACHED) ?:
+ iter->flags|BTREE_ITER_cached) ?:
bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
@@ -2040,7 +2179,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
struct btree_path_level *l;
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2065,7 +2204,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
@@ -2076,10 +2215,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
}
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
k = btree_trans_peek_journal(trans, iter, k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_updates(trans, iter, &k);
@@ -2131,11 +2270,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
struct bpos iter_pos;
int ret;
- EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+ bch2_trans_verify_not_unlocked(trans);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
@@ -2158,7 +2298,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* isn't monotonically increasing before FILTER_SNAPSHOTS, and
* that's what we check against in extents mode:
*/
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
? bkey_gt(k.k->p, end)
: k.k->p.inode > end.inode))
goto end;
@@ -2166,13 +2306,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
if (iter->update_path &&
!bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_INTENT) &&
- !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_filter_snapshots) &&
+ (iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
!iter->update_path) {
struct bpos pos = k.k->p;
@@ -2187,12 +2327,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
_THIS_IP_);
ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
if (unlikely(ret)) {
@@ -2205,7 +2345,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* We can never have a key in a leaf node at POS_MAX, so
* we don't have to check these successor() calls:
*/
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ if ((iter->flags & BTREE_ITER_filter_snapshots) &&
!bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
@@ -2214,7 +2354,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
}
if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ !(iter->flags & BTREE_ITER_all_snapshots)) {
search_key = bkey_successor(iter, k.k->p);
continue;
}
@@ -2224,12 +2364,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* equal to the key we just returned - except extents can
* straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (!(iter->flags & BTREE_ITER_is_extents))
iter_pos = k.k->p;
else
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
? bkey_gt(iter_pos, end)
: bkey_ge(iter_pos, end)))
goto end;
@@ -2240,7 +2380,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->pos = iter_pos;
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
@@ -2253,7 +2393,7 @@ out_no_locked:
btree_path_set_should_be_locked(trans->paths + iter->update_path);
}
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
ret = bch2_btree_iter_verify_ret(iter, k);
@@ -2303,21 +2443,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_path_idx_t saved_path = 0;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
EBUG_ON(btree_iter_path(trans, iter)->cached ||
btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ if (iter->flags & BTREE_ITER_with_journal)
return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2332,17 +2473,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
- ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ((iter->flags & BTREE_ITER_is_extents)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
if (k.k->p.snapshot == iter->snapshot)
goto got_key;
@@ -2353,7 +2494,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
*/
if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
bch2_path_put_nokeep(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
iter->k = saved_k;
@@ -2366,9 +2507,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
k.k->p.snapshot)) {
if (saved_path)
bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
path = btree_iter_path(trans, iter);
saved_k = *k.k;
saved_v = k.v;
@@ -2379,9 +2521,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
}
got_key:
if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ !(iter->flags & BTREE_ITER_all_snapshots)) {
search_key = bkey_predecessor(iter, k.k->p);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
continue;
}
@@ -2405,11 +2547,11 @@ got_key:
if (bkey_lt(k.k->p, iter->pos))
iter->pos = k.k->p;
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2439,12 +2581,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c k;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
/* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
@@ -2454,7 +2597,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
search_key = btree_iter_search_key(iter);
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2463,22 +2606,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- if ((iter->flags & BTREE_ITER_CACHED) ||
- !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ if ((iter->flags & BTREE_ITER_cached) ||
+ !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates)) {
bch2_btree_trans_peek_slot_updates(trans, iter, &k);
if (k.k)
goto out;
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
(k = btree_trans_peek_slot_journal(trans, iter)).k)
goto out;
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
(k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
@@ -2493,12 +2636,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bpos next;
struct bpos end = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
end.offset = U64_MAX;
EBUG_ON(btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_INTENT) {
+ if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
bch2_trans_copy_iter(&iter2, iter);
@@ -2529,7 +2672,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (iter->flags & BTREE_ITER_is_extents) {
bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
(next.inode == iter->pos.inode
@@ -2713,13 +2856,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
@@ -2744,9 +2887,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
unsigned depth,
unsigned flags)
{
- flags |= BTREE_ITER_NOT_EXTENTS;
- flags |= __BTREE_ITER_ALL_SNAPSHOTS;
- flags |= BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_not_extents;
+ flags |= BTREE_ITER_snapshot_field;
+ flags |= BTREE_ITER_all_snapshots;
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
__bch2_btree_iter_flags(trans, btree_id, flags),
@@ -2769,9 +2912,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
- __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
- __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@@ -2790,6 +2933,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+		/* No more space in the mempool buffer; we need to kmalloc a new one */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
@@ -2798,6 +2966,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
kfree(trans->mem);
}
@@ -2811,7 +2981,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-
+out_new_mem:
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -2819,7 +2989,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
-
+out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -2913,7 +3083,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
if (!trans->restarted &&
(need_resched() ||
time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- drop_locks_do(trans, (cond_resched(), 0));
+ bch2_trans_unlock(trans);
+ cond_resched();
now = local_clock();
}
trans->last_begin_time = now;
@@ -2923,11 +3094,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
+ trans->locked = true;
+
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
trans->notrace_relock_fail = false;
}
+ bch2_trans_verify_not_unlocked(trans);
return trans->restart_count;
}
@@ -2980,7 +3154,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
*/
BUG_ON(pos_task &&
pid == pos_task->pid &&
- bch2_trans_locked(pos));
+ pos->locked);
if (pos_task && pid < pos_task->pid) {
list_add_tail(&trans->list, &pos->list);
@@ -2996,8 +3170,9 @@ got_trans:
trans->last_begin_time = local_clock();
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
+ trans->locked = true;
trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
trans->nr_paths = ARRAY_SIZE(trans->_paths);
trans->paths_allocated = trans->_paths_allocated;
@@ -3093,7 +3268,7 @@ void bch2_trans_put(struct btree_trans *trans)
if (paths_allocated != trans->_paths_allocated)
kvfree_rcu_mightsleep(paths_allocated);
- if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ if (trans->used_mempool)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
@@ -3126,13 +3301,11 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
pid = owner ? owner->pid : 0;
rcu_read_unlock();
- prt_tab(out);
- prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
- prt_tab(out);
- prt_printf(out, " locks %u:%u:%u held by pid %u",
+ prt_printf(out, "\t locks %u:%u:%u held by pid %u",
c.n[0], c.n[1], c.n[2], pid);
}
@@ -3189,10 +3362,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
b = READ_ONCE(trans->locking);
if (b) {
- prt_printf(out, " blocked for %lluus on",
- div_u64(local_clock() - trans->locking_wait.start_time,
- 1000));
- prt_newline(out);
+ prt_printf(out, " blocked for %lluus on\n",
+ div_u64(local_clock() - trans->locking_wait.start_time, 1000));
prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
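The btree_iter.c changes above track lock state in trans->locked and assert it with bch2_trans_verify_not_unlocked() at the entry points that walk btree paths. A minimal sketch of the calling pattern follows; the helper example_peek_slot() is hypothetical, the other names are taken from this diff:

static struct bkey_s_c example_peek_slot(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

	/*
	 * If the transaction was unlocked and never relocked, this panics via
	 * bch2_trans_unlocked_error() with the IP recorded at the unlock:
	 */
	bch2_trans_verify_not_unlocked(trans);

	return bch2_btree_iter_peek_slot(iter);
}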
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc..798eb1c47966 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -216,9 +216,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
btree_path_idx_t,
unsigned, unsigned long);
+static inline void bch2_trans_verify_not_unlocked(struct btree_trans *);
+
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_idx_t path, unsigned flags)
{
+ bch2_trans_verify_not_unlocked(trans);
+
if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
@@ -227,6 +231,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran
btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
+ unsigned, struct bpos);
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
/*
@@ -283,7 +290,6 @@ int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
void bch2_trans_unlock_long(struct btree_trans *);
-bool bch2_trans_locked(struct btree_trans *);
static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
@@ -309,6 +315,14 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
bch2_trans_in_restart_error(trans);
}
+void __noreturn bch2_trans_unlocked_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
+{
+ if (!trans->locked)
+ bch2_trans_unlocked_error(trans);
+}
+
__always_inline
static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
{
@@ -386,10 +400,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
new_pos.snapshot = iter->snapshot;
__bch2_btree_iter_set_pos(iter, new_pos);
@@ -397,7 +411,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
- BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
iter->pos = bkey_start_pos(&iter->k);
}
@@ -416,20 +430,20 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
unsigned btree_id,
unsigned flags)
{
- if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_IS_EXTENTS;
+ flags |= BTREE_ITER_is_extents;
- if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_snapshot_field) &&
!btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ flags &= ~BTREE_ITER_all_snapshots;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_all_snapshots) &&
btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ flags |= BTREE_ITER_filter_snapshots;
if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_WITH_JOURNAL;
+ flags |= BTREE_ITER_with_journal;
return flags;
}
@@ -439,10 +453,10 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
unsigned flags)
{
if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_CACHED;
- flags &= ~BTREE_ITER_WITH_KEY_CACHE;
- } else if (!(flags & BTREE_ITER_CACHED))
- flags |= BTREE_ITER_WITH_KEY_CACHE;
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
return __bch2_btree_iter_flags(trans, btree_id, flags);
}
@@ -494,13 +508,7 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
unsigned, unsigned, unsigned);
void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-static inline void set_btree_iter_dontneed(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
-
- if (!trans->restarted)
- btree_iter_path(trans, iter)->preserve = false;
-}
+void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -614,14 +622,14 @@ u32 bch2_trans_begin(struct btree_trans *);
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
@@ -629,7 +637,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
struct bpos end,
unsigned flags)
{
- if (!(flags & BTREE_ITER_SLOTS))
+ if (!(flags & BTREE_ITER_slots))
return bch2_btree_iter_peek_upto(iter, end);
if (bkey_gt(iter->pos, end))
@@ -642,7 +650,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
@@ -694,16 +702,12 @@ transaction_restart: \
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
+#define for_each_btree_key_upto_continue(_trans, _iter, \
+ _end, _flags, _k, _do) \
({ \
- struct btree_iter _iter; \
struct bkey_s_c _k; \
int _ret3 = 0; \
\
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
do { \
_ret3 = lockrestart_do(_trans, ({ \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
@@ -719,6 +723,21 @@ transaction_restart: \
_ret3; \
})
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
+ for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
+
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ bch2_trans_begin(trans); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\
+})
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
@@ -789,14 +808,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
return k;
}
-#define for_each_btree_key_old(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -827,7 +838,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
- _do ?: bch2_trans_relock(_trans); \
+ (_do) ?: bch2_trans_relock(_trans); \
})
#define allocate_dropping_locks_errcode(_trans, _do) \
@@ -856,6 +867,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
})
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
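The header now provides for_each_btree_key_upto_continue()/for_each_btree_key_continue() so an already-initialized iterator can be resumed, while for_each_btree_key_upto() begins the transaction itself. A minimal sketch of resuming a scan with for_each_btree_key_continue(), assuming the inodes btree and an empty per-key body purely for illustration:

static int example_resume_scan(struct btree_trans *trans, struct bpos start)
{
	struct btree_iter iter;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, start, 0);

	int ret = for_each_btree_key_continue(trans, iter, 0, k, ({
		/* per-key work goes here; returning 0 keeps iterating */
		0;
	}));

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}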
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 50e04356d72c..332dbf164929 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
+static void journal_iter_verify(struct journal_iter *iter)
+{
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ BUG_ON(cmp < 0);
+ }
+}
+
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
@@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
move_gap(keys, idx);
@@ -261,6 +289,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -285,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->data + iter->idx;
+ journal_iter_verify(iter);
+
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ if (cmp > 0)
+ break;
+ BUG_ON(cmp);
- while (k < iter->keys->data + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
@@ -314,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -363,7 +414,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c btree_k, journal_k, ret;
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
@@ -375,9 +426,10 @@ again:
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -417,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
@@ -435,7 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@@ -548,3 +604,39 @@ int bch2_journal_keys_sort(struct bch_fs *c)
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+ unsigned level_min, unsigned level_max,
+ struct bpos start, struct bpos end)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t dst = 0;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i)
+ if (!(i->btree_id == btree &&
+ i->level >= level_min &&
+ i->level <= level_max &&
+ bpos_ge(i->k->k.p, start) &&
+ bpos_le(i->k->k.p, end)))
+ keys->data[dst++] = *i;
+ keys->nr = keys->gap = dst;
+}
+
+void bch2_journal_keys_dump(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct printbuf buf = PRINTBUF;
+
+ pr_info("%zu keys:", keys->nr);
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
+ }
+ printbuf_exit(&buf);
+}
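The new helpers above are exported for recovery paths. A small sketch of the kind of check bch2_key_deleted_in_journal() makes possible, assuming the alloc btree at level 0; only the helper itself comes from this diff:

static bool example_key_will_be_deleted(struct btree_trans *trans,
					struct bpos pos)
{
	/* true if journal replay holds a whiteout for this exact position: */
	return bch2_key_deleted_in_journal(trans, BTREE_ID_alloc, 0, pos);
}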
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index c9d19da3ea04..1ba4a79b0ef9 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
@@ -66,4 +66,10 @@ void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
+void bch2_journal_keys_dump(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
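A sketch of calling the new bch2_shoot_down_journal_keys() prototype when journal keys for a btree's interior nodes have to be discarded; the btree id and range here are placeholders, not taken from a real caller:

static void example_drop_journal_keys(struct bch_fs *c)
{
	/* drop journal keys for all interior nodes of the alloc btree: */
	bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
				     1, BTREE_MAX_DEPTH,
				     POS_MIN, SPOS_MAX);
}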
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 581edcb0911b..34056aaece00 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
@@ -381,9 +383,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
- iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+ BTREE_ITER_key_cache_fill|
+ BTREE_ITER_cached_nofill);
+ iter.flags &= ~BTREE_ITER_with_journal;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -422,16 +424,16 @@ static int btree_key_cache_fill(struct btree_trans *trans,
goto err;
}
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ ret = bch2_trans_relock(trans);
+ if (ret) {
kfree(new_k);
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
- ret = bch2_trans_relock(trans);
- if (ret) {
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
kfree(new_k);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
}
@@ -454,7 +456,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -513,23 +515,10 @@ retry:
fill:
path->uptodate = BTREE_ITER_UPTODATE;
- if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
- /*
- * Using the underscore version because we haven't set
- * path->uptodate yet:
- */
- if (!path->locks_want &&
- !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
- trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
- goto err;
- }
-
- ret = btree_key_cache_fill(trans, path, ck);
- if (ret)
- goto err;
-
- ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+ if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
+ ret = bch2_btree_path_upgrade(trans, path, 1) ?:
+ btree_key_cache_fill(trans, path, ck) ?:
+ bch2_btree_path_relock(trans, path, _THIS_IP_);
if (ret)
goto err;
@@ -620,13 +609,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_slots|
+ BTREE_ITER_intent|
+ BTREE_ITER_all_snapshots);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ b_iter.flags &= ~BTREE_ITER_with_key_cache;
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
@@ -659,14 +648,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_KEY_CACHE_RECLAIM|
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- BTREE_TRIGGER_NORUN) ?:
+ BTREE_UPDATE_key_cache_reclaim|
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
@@ -788,7 +777,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
* flushing. The flush callback will not proceed unless ->seq matches
* the latest pin, so make sure it starts with a consistent value.
*/
- if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
!journal_pin_active(&ck->journal)) {
ck->seq = trans->journal_res.seq;
}
@@ -833,6 +822,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
int srcu_idx;
mutex_lock(&bc->lock);
+ bc->requested_to_free += sc->nr_to_scan;
+
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
@@ -840,8 +831,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- scanned += bc->nr_freed_nonpcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -853,13 +842,9 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
atomic_long_dec(&bc->nr_freed);
freed++;
bc->nr_freed_nonpcpu--;
+ bc->freed++;
}
- if (scanned >= nr)
- goto out;
-
- scanned += bc->nr_freed_pcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -871,11 +856,9 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
atomic_long_dec(&bc->nr_freed);
freed++;
bc->nr_freed_pcpu--;
+ bc->freed++;
}
- if (scanned >= nr)
- goto out;
-
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (bc->shrink_iter >= tbl->size)
@@ -891,14 +874,19 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
ck = container_of(pos, struct bkey_cached, hash);
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ bc->skipped_dirty++;
goto next;
-
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_accessed++;
+ goto next;
+ } else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
+ bc->moved_to_freelist++;
+ } else {
+ bc->skipped_lock_fail++;
}
scanned++;
@@ -914,7 +902,6 @@ next:
} while (scanned < nr && bc->shrink_iter != start);
rcu_read_unlock();
-out:
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
@@ -965,13 +952,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#ifdef __KERNEL__
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
+ if (bc->pcpu_freed) {
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
}
}
#endif
@@ -1044,14 +1033,47 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
return 0;
}
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
- prt_newline(out);
- prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
- prt_newline(out);
- prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
- prt_newline(out);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 12);
+
+ unsigned flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+ prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
+ prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
+ prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
+ prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
+
+ prt_printf(out, "\nshrinker:\n");
+ prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
+ prt_printf(out, "freed:\t%lu\r\n", bc->freed);
+ prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
+ prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
+ prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
+ prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
+
+ prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
+
+ struct bkey_cached *ck;
+ unsigned iter = 0;
+ list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
+ prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
+ if (++iter > 10)
+ break;
+ }
+
+ iter = 0;
+ list_for_each_entry(ck, &bc->freed_pcpu, list) {
+ prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
+ if (++iter > 10)
+ break;
+ }
+ mutex_unlock(&bc->lock);
+ memalloc_flags_restore(flags);
}
void bch2_btree_key_cache_exit(void)
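btree_key_cache_fill() is now reached through a chain of steps joined with the GNU "?:" operator (upgrade, fill, relock), where the first nonzero error code short-circuits the rest. A tiny illustration of that idiom with hypothetical step functions; EINVAL stands in for a real bcachefs errcode:

static int step_one(void)   { return 0; }
static int step_two(void)   { return -EINVAL; }
static int step_three(void) { return 0; }

static int example_chain(void)
{
	/* step_three() never runs: step_two() already returned an error */
	return step_one() ?: step_two() ?: step_three();
}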
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 290e4e57df5b..237e8bb3ac40 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -24,6 +24,14 @@ struct btree_key_cache {
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
+
+ /* shrinker stats */
+ unsigned long requested_to_free;
+ unsigned long freed;
+ unsigned long moved_to_freelist;
+ unsigned long skipped_dirty;
+ unsigned long skipped_accessed;
+ unsigned long skipped_lock_fail;
};
struct bkey_cached_key {
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index b9b151e693ed..d66fff22109a 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -83,8 +83,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
- prt_printf(out, "Found lock cycle (%u entries):", g->nr);
- prt_newline(out);
+ prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
for (i = g->g; i < g->g + g->nr; i++) {
struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
@@ -216,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
@@ -224,8 +224,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
bch2_btree_trans_to_text(&buf, trans);
- prt_printf(&buf, "backtrace:");
- prt_newline(&buf);
+ prt_printf(&buf, "backtrace:\n");
printbuf_indent_add(&buf, 2);
bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
@@ -440,33 +439,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
@@ -518,8 +491,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
- bch2_trans_verify_locks(trans);
-
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
@@ -635,7 +606,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa
{
struct get_locks_fail f;
- return btree_path_get_locks(trans, path, false, &f);
+ bool ret = btree_path_get_locks(trans, path, false, &f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -658,7 +631,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
path->locks_want = new_locks_want;
- return btree_path_get_locks(trans, path, true, f);
+ bool ret = btree_path_get_locks(trans, path, true, f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
@@ -666,8 +641,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
- return true;
+ bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
+ if (ret)
+ goto out;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
@@ -701,8 +677,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
btree_path_get_locks(trans, linked, true, NULL);
}
}
-
- return false;
+out:
+ bch2_trans_verify_locks(trans);
+ return ret;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
@@ -751,82 +728,100 @@ void bch2_trans_downgrade(struct btree_trans *trans)
bch2_btree_path_downgrade(trans, path);
}
-int bch2_trans_relock(struct btree_trans *trans)
+static inline void __bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
unsigned i;
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
+ trans_for_each_path(trans, path, i)
+ __bch2_btree_path_unlock(trans, path);
+}
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
+static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+ struct get_locks_fail *f, bool trace)
+{
+ if (!trace)
+ goto out;
- if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f)) {
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=",
- f.l, path->l[f.l].lock_seq);
- if (IS_ERR_OR_NULL(f.b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
- } else {
- prt_printf(&buf, "%u", f.b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f.b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ if (IS_ERR_OR_NULL(f->b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
+ } else {
+ prt_printf(&buf, "%u", f->b->c.lock.seq);
- count_event(trans->c, trans_restart_relock);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f->b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
}
- return 0;
+ count_event(trans->c, trans_restart_relock);
+out:
+ __bch2_trans_unlock(trans);
+ bch2_trans_verify_locks(trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
-int bch2_trans_relock_notrace(struct btree_trans *trans)
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
{
- struct btree_path *path;
- unsigned i;
+ bch2_trans_verify_locks(trans);
if (unlikely(trans->restarted))
return -((int) trans->restarted);
+ if (unlikely(trans->locked))
+ goto out;
+
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
- trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path)) {
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- }
+ !btree_path_get_locks(trans, path, false, &f))
+ return bch2_trans_relock_fail(trans, path, &f, trace);
+ }
+
+ trans->locked = true;
+out:
+ bch2_trans_verify_locks(trans);
return 0;
}
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, true);
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, false);
+}
+
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
}
void bch2_trans_unlock_long(struct btree_trans *trans)
@@ -835,17 +830,6 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
}
-bool bch2_trans_locked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
-}
-
int __bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
@@ -862,15 +846,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
void bch2_btree_path_verify_locks(struct btree_path *path)
{
- unsigned l;
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level) &&
+ !path->nodes_locked);
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
+ if (!path->nodes_locked)
return;
- }
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
int have = btree_node_locked_type(path, l);
@@ -883,8 +871,24 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
}
}
+static bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
void bch2_trans_verify_locks(struct btree_trans *trans)
{
+ if (!trans->locked) {
+ BUG_ON(bch2_trans_locked(trans));
+ return;
+ }
+
struct btree_path *path;
unsigned i;
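A minimal userspace sketch, with simplified toy types, of the invariant that the new trans->locked flag adds to bch2_trans_verify_locks() above: once a transaction is marked unlocked, no path may still hold node locks.

#include <assert.h>
#include <stdbool.h>

struct path  { unsigned nodes_locked; };
struct trans { bool locked; unsigned nr_paths; struct path paths[4]; };

static bool any_path_locked(const struct trans *t)
{
	for (unsigned i = 0; i < t->nr_paths; i++)
		if (t->paths[i].nodes_locked)
			return true;
	return false;
}

/* Mirrors the new early return in bch2_trans_verify_locks() */
static void verify_locks(const struct trans *t)
{
	if (!t->locked)
		assert(!any_path_locked(t));
}

int main(void)
{
	struct trans t = { .locked = false, .nr_paths = 2 };

	verify_locks(&t);	/* passes: unlocked and nothing held */
	return 0;
}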
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 4bd72c855da1..7f41545b9147 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -364,14 +364,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- struct get_locks_fail f;
+ struct get_locks_fail f = {};
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->uptodate == BTREE_ITER_UPTODATE)
+ : path->nodes_locked)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 000000000000..45cb8149d374
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+ struct closure *cl;
+ struct find_btree_nodes *f;
+ struct bch_dev *ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+ prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+ bch2_bpos_to_text(out, n->min_key);
+ prt_str(out, "-");
+ bch2_bpos_to_text(out, n->max_key);
+
+ if (n->range_updated)
+ prt_str(out, " range updated");
+ if (n->overwritten)
+ prt_str(out, " overwritten");
+
+ for (unsigned i = 0; i < n->nr_ptrs; i++) {
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+ }
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+ printbuf_indent_add(out, 2);
+ darray_for_each(nodes, i) {
+ found_btree_node_to_text(out, c, i);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.sectors_written = 0;
+ bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+ struct found_btree_node *f)
+{
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+ found_btree_node_to_key(&k.k, f);
+
+ struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+ bool ret = !IS_ERR_OR_NULL(b);
+ if (ret) {
+ f->sectors_written = b->written;
+ six_unlock_read(&b->c.lock);
+ }
+
+ /*
+ * We might update this node's range; if that happens, we need the node
+ * to be re-read so the read path can trim keys that are no longer in
+ * this node
+ */
+ if (b != btree_node_root(trans->c, b))
+ bch2_btree_node_evict(trans, &k.k);
+ return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+ void *buf = (void *) __get_free_page(GFP_KERNEL);
+ struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
+
+ if (!buf || !bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+ for (unsigned bucket_offset = 0;
+ bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+ bucket_offset += btree_sectors(c)) {
+ if (time_after(jiffies, last_print + HZ * 30)) {
+ u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+ u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+ bch_info(ca, "%s: %2u%% done", __func__,
+ (unsigned) div64_u64(cur_sector * 100, end_sector));
+ last_print = jiffies;
+ }
+
+ u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+ try_read_btree_node(w->f, ca, bio, buf, sector);
+ }
+err:
+ bio_put(bio);
+ free_page((unsigned long) buf);
+ percpu_ref_put(&ca->io_ref);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ struct task_struct *t;
+
+ if (!w) {
+ percpu_ref_put(&ca->io_ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ percpu_ref_get(&ca->io_ref);
+ closure_get(&cl);
+ w->cl = &cl;
+ w->f = f;
+ w->ca = ca;
+
+ t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = IS_ERR_OR_NULL(t);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ closure_put(&cl);
+ f->ret = ret;
+ bch_err(c, "error starting kthread: %i", ret);
+ break;
+ }
+ }
+err:
+ closure_sync(&cl);
+ return f->ret ?: ret;
+}
+
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+ while (n + 1 < end &&
+ found_btree_node_cmp_pos(n, n + 1) > 0) {
+ swap(n[0], n[1]);
+ n++;
+ }
+}
+
+static int handle_overwrites(struct bch_fs *c,
+ struct found_btree_node *start,
+ struct found_btree_node *end)
+{
+ struct found_btree_node *n;
+again:
+ for (n = start + 1;
+ n < end &&
+ n->btree_id == start->btree_id &&
+ n->level == start->level &&
+ bpos_lt(n->min_key, start->max_key);
+ n++) {
+ int cmp = found_btree_node_cmp_time(start, n);
+
+ if (cmp > 0) {
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ bubble_up(n, end);
+ goto again;
+ }
+ } else if (cmp < 0) {
+ BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+ start->max_key = bpos_predecessor(n->min_key);
+ start->range_updated = true;
+ } else if (n->level) {
+ n->overwritten = true;
+ } else {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
+ found_btree_node_to_text(&buf, c, start);
+ prt_str(&buf, "\n ");
+ found_btree_node_to_text(&buf, c, n);
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ size_t dst;
+ int ret = 0;
+
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ if (i->overwritten)
+ continue;
+
+ ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (ret)
+ goto err;
+
+ BUG_ON(i->overwritten);
+ f->nodes.data[dst++] = *i;
+ }
+ f->nodes.nr = dst;
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
+#define for_each_found_btree_node_in_range(_f, _search, _idx) \
+ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
+ sizeof((_f)->nodes.data[0]), \
+ found_btree_node_range_start_cmp, &search); \
+ _idx < (_f)->nodes.nr && \
+ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
+ (_f)->nodes.data[_idx].level == _search.level && \
+ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
+ _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ struct found_btree_node search = {
+ .btree_id = b->c.btree_id,
+ .level = b->c.level,
+ .min_key = b->data->min_key,
+ .max_key = b->key.k.p,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx)
+ if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+ return true;
+ return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+ .min_key = POS_MIN,
+ .max_key = SPOS_MAX,
+ };
+
+ for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+ return true;
+ return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ struct found_btree_node n = f->nodes.data[idx];
+
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
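To make the replica-merging step in bch2_scan_for_btree_nodes() easier to follow, here is a self-contained userspace sketch (hypothetical types; the "too many replicas" error case is omitted): after sorting by cookie, consecutive entries with the same cookie describe the same node found on different devices, so their single pointers are folded into one multi-pointer entry.

#include <stdio.h>

#define MAX_PTRS 4

struct node {
	unsigned long long cookie;
	unsigned nr_ptrs;
	unsigned ptrs[MAX_PTRS];	/* device indexes, for illustration */
};

/* Compact an array sorted by cookie, merging duplicate-cookie entries */
static unsigned merge_replicas(struct node *v, unsigned nr)
{
	unsigned dst = 0;

	for (unsigned i = 0; i < nr; i++) {
		struct node *prev = dst ? &v[dst - 1] : NULL;

		if (prev && prev->cookie == v[i].cookie &&
		    prev->nr_ptrs < MAX_PTRS)
			prev->ptrs[prev->nr_ptrs++] = v[i].ptrs[0];
		else
			v[dst++] = v[i];
	}
	return dst;
}

int main(void)
{
	/* already sorted by cookie */
	struct node v[] = {
		{ .cookie = 1, .nr_ptrs = 1, .ptrs = { 0 } },
		{ .cookie = 1, .nr_ptrs = 1, .ptrs = { 2 } },
		{ .cookie = 7, .nr_ptrs = 1, .ptrs = { 1 } },
	};
	unsigned nr = merge_replicas(v, 3);

	printf("%u nodes, first has %u replicas\n", nr, v[0].nr_ptrs);
	return 0;
}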
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 000000000000..08687b209787
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 000000000000..5cfaeb5ac831
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+ bool range_updated:1;
+ bool overwritten:1;
+ u8 btree_id;
+ u8 level;
+ unsigned sectors_written;
+ u32 seq;
+ u64 cookie;
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+struct find_btree_nodes {
+ int ret;
+ struct mutex lock;
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d133e..74e1ff225674 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -19,6 +20,26 @@
#include <linux/prefetch.h>
+static const char * const trans_commit_flags_strs[] = {
+#define x(n, ...) #n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
+{
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+
+ prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
+
+ flags >>= BCH_WATERMARK_BITS;
+ if (flags) {
+ prt_char(out, ' ');
+ bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
+ }
+}
+
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -315,10 +336,10 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
- test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
+ test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -397,12 +418,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
EBUG_ON(path->level);
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -442,13 +464,13 @@ static int run_one_mem_trigger(struct btree_trans *trans,
verify_update_old_key(trans, i);
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ if (unlikely(flags & BTREE_TRIGGER_norun))
return 0;
if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
} else {
ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
bkey_i_to_s(new), flags) ?:
@@ -471,11 +493,11 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
+ unsigned flags = i->flags|BTREE_TRIGGER_transactional;
verify_update_old_key(trans, i);
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ if ((i->flags & BTREE_TRIGGER_norun) ||
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
return 0;
@@ -485,8 +507,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
+ BTREE_TRIGGER_insert|
+ BTREE_TRIGGER_overwrite|flags) ?: 1;
} else if (overwrite && !i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
@@ -499,9 +521,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+ unsigned btree_id_start)
{
- struct btree_insert_entry *i;
bool trans_trigger_run;
int ret, overwrite;
@@ -514,13 +535,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
do {
trans_trigger_run = false;
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ for (unsigned i = btree_id_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
i++) {
- if (i->btree_id != btree_id)
+ if (trans->updates[i].btree_id != btree_id)
continue;
- ret = run_one_trans_trigger(trans, i, overwrite);
+ ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -534,8 +555,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *btree_id_start = trans->updates;
- unsigned btree_id = 0;
+ unsigned btree_id = 0, btree_id_start = 0;
int ret = 0;
/*
@@ -549,8 +569,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
+ while (btree_id_start < trans->nr_updates &&
+ trans->updates[btree_id_start].btree_id < btree_id)
btree_id_start++;
ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +578,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
return ret;
}
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+
if (i->btree_id > BTREE_ID_alloc)
break;
if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
if (ret)
return ret;
break;
@@ -571,7 +593,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
@@ -589,7 +611,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
if (ret)
return ret;
}
@@ -608,6 +630,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned u64s = 0;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_in_restart(trans);
+
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
@@ -685,7 +710,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
if (ret)
goto fatal_err;
}
@@ -704,7 +729,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
verify_update_old_key(trans, i);
@@ -765,16 +790,15 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct btree_insert_entry *i,
struct printbuf *err)
{
struct bch_fs *c = trans->c;
printbuf_reset(err);
- prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ prt_printf(err, "invalid bkey on insert from %s -> %ps\n",
trans->fn, (void *) i->ip_allocated);
- prt_newline(err);
printbuf_indent_add(err, 2);
bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
@@ -795,8 +819,7 @@ static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
- prt_newline(&buf);
+ prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn);
printbuf_indent_add(&buf, 2);
bch2_journal_entry_to_text(&buf, c, i);
@@ -826,7 +849,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
struct bch_fs *c = trans->c;
int ret = 0, u64s_delta = 0;
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
if (i->cached)
continue;
@@ -887,6 +911,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +930,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
@@ -985,6 +1010,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret = 0;
+ bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_in_restart(trans);
+
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;
@@ -997,10 +1025,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans_for_each_update(trans, i) {
struct printbuf buf = PRINTBUF;
- enum bkey_invalid_flags invalid_flags = 0;
+ enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags, &buf)))
@@ -1015,10 +1043,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i)) {
- enum bkey_invalid_flags invalid_flags = 0;
+ enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
if (unlikely(bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
@@ -1062,7 +1090,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
/* we're going to journal the key being updated: */
@@ -1083,6 +1111,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
}
retry:
errored_at = NULL;
+ bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
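A small sketch of the flag layout these changes rely on (constants here are illustrative, not the real bcachefs values): the allocation watermark occupies the low bits of the commit flags, bch2_trans_commit_flags_to_text() peels it off before printing the boolean flags, and checks such as "watermark < BCH_WATERMARK_reclaim" compare urgency by integer order.

#include <stdio.h>

#define WATERMARK_BITS	4
#define WATERMARK_MASK	((1U << WATERMARK_BITS) - 1)

enum watermark { WM_normal, WM_copygc, WM_reclaim };	/* ordered by urgency */

enum commit_flag {
	COMMIT_no_enospc	= 1U << (WATERMARK_BITS + 0),
	COMMIT_journal_reclaim	= 1U << (WATERMARK_BITS + 1),
};

int main(void)
{
	unsigned flags = WM_copygc | COMMIT_journal_reclaim;
	enum watermark wm = flags & WATERMARK_MASK;

	/* less urgent than reclaim: back off rather than risk a deadlock */
	if ((flags & COMMIT_journal_reclaim) && wm < WM_reclaim)
		printf("would return -journal_reclaim_would_deadlock\n");
	return 0;
}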
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 9404d96c38f3..d63db4fefe73 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -163,9 +163,21 @@ struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ unsigned freed;
+ unsigned not_freed_lock_intent;
+ unsigned not_freed_lock_write;
+ unsigned not_freed_dirty;
+ unsigned not_freed_read_in_flight;
+ unsigned not_freed_write_in_flight;
+ unsigned not_freed_noevict;
+ unsigned not_freed_write_blocked;
+ unsigned not_freed_will_make_reachable;
+ unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker *shrink;
+ unsigned used_by_btree[BTREE_ID_NR];
+
/*
* If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache
@@ -187,36 +199,89 @@ struct btree_node_iter {
} data[MAX_BSETS];
};
+#define BTREE_ITER_FLAGS() \
+ x(slots) \
+ x(intent) \
+ x(prefetch) \
+ x(is_extents) \
+ x(not_extents) \
+ x(cached) \
+ x(with_key_cache) \
+ x(with_updates) \
+ x(with_journal) \
+ x(snapshot_field) \
+ x(all_snapshots) \
+ x(filter_snapshots) \
+ x(nopreserve) \
+ x(cached_nofill) \
+ x(key_cache_fill) \
+
+#define STR_HASH_FLAGS() \
+ x(must_create) \
+ x(must_replace)
+
+#define BTREE_UPDATE_FLAGS() \
+ x(internal_snapshot_node) \
+ x(nojournal) \
+ x(key_cache_reclaim)
+
+
/*
- * Iterate over all possible positions, synthesizing deleted keys for holes:
- */
-static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-/*
- * Indicates that intent locks should be taken on leaf nodes, because we expect
- * to be doing updates:
- */
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
-/*
- * Causes the btree iterator code to prefetch additional btree nodes from disk:
- */
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
+ * BTREE_TRIGGER_norun - don't run triggers at all
+ *
+ * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
+ * a transaction commit: triggers may generate new updates
+ *
+ * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
+ * commit: we have our journal reservation, we're holding btree node write
+ * locks, and we know the transaction is going to commit (returning an error
+ * here is a fatal error, causing us to go emergency read-only)
+ *
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+ *
+ * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+ * trigger
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
-#define __BTREE_ITER_FLAGS_END 15
+#define BTREE_TRIGGER_FLAGS() \
+ x(norun) \
+ x(transactional) \
+ x(atomic) \
+ x(check_repair) \
+ x(gc) \
+ x(insert) \
+ x(overwrite) \
+ x(is_root) \
+ x(bucket_invalidate)
+
+enum {
+#define x(n) BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+ STR_HASH_FLAGS()
+ BTREE_UPDATE_FLAGS()
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
+
+/* iter flags must fit in a u16: */
+//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
+
+enum btree_iter_update_trigger_flags {
+#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+#undef x
+#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ STR_HASH_FLAGS()
+#undef x
+#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_UPDATE_FLAGS()
+#undef x
+#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -307,7 +372,7 @@ struct btree_iter {
*/
struct bkey k;
- /* BTREE_ITER_WITH_JOURNAL: */
+ /* BTREE_ITER_with_journal: */
size_t journal_idx;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
@@ -321,9 +386,9 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
+ unsigned long btree_trans_barrier_seq;
u16 u64s;
bool valid;
- u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -364,7 +429,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Limit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have been eliminated now, so we might
+ * consider eliminating this (and btree_trans_too_many_iters()) at some point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
@@ -404,6 +483,8 @@ struct btree_trans {
u8 lock_must_abort;
bool lock_may_not_fail:1;
bool srcu_held:1;
+ bool locked:1;
+ bool write_locked:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -411,13 +492,13 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
- bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
+ unsigned long last_unlock_ip;
unsigned long srcu_lock_time;
const char *fn;
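The iterator/update/trigger flags above are generated with an x-macro; a minimal standalone example of the pattern (toy names, not the bcachefs lists): the same list expands once into sequential bit positions and once into single-bit flag values, so new flags can be appended without hand-numbered shift constants.

#include <stdio.h>

#define MY_FLAGS()	\
	x(slots)	\
	x(intent)	\
	x(prefetch)

enum {
#define x(n) FLAG_BIT_##n,
	MY_FLAGS()
#undef x
	FLAG_BIT_NR
};

enum my_flags {
#define x(n) FLAG_##n = 1U << FLAG_BIT_##n,
	MY_FLAGS()
#undef x
};

int main(void)
{
	unsigned f = FLAG_intent | FLAG_prefetch;

	printf("%u bits defined, intent set: %d\n",
	       FLAG_BIT_NR, !!(f & FLAG_intent));
	return 0;
}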
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index a4b40c1656a5..f3c645a43dcb 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -25,19 +25,22 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
static int __must_check
bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_update_flags,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
unsigned long ip);
static noinline int extent_front_merge(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct bkey_i **insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *update;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
update = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(update);
if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
if (ret < 0)
@@ -98,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
pos.snapshot++;
for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_nopreserve, k, ret) {
if (!bkey_eq(k.k->p, pos))
break;
@@ -132,8 +138,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
darray_init(&s);
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
@@ -145,8 +151,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
continue;
new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bkey_err(new_k);
if (ret)
break;
@@ -162,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
update->k.type = KEY_TYPE_whiteout;
ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
bch2_trans_iter_exit(trans, &new_iter);
@@ -179,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
struct bkey_s_c old,
struct bkey_s_c new)
{
@@ -212,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -229,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -254,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -267,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
bch2_cut_front(new.k->p, update);
ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
flags, _RET_IP_);
if (ret)
return ret;
@@ -279,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
static int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -287,9 +293,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
int ret = 0;
bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_NOT_EXTENTS);
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates|
+ BTREE_ITER_not_extents);
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
@@ -340,7 +346,7 @@ err:
static noinline int flush_new_cached_update(struct btree_trans *trans,
struct btree_insert_entry *i,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bkey k;
@@ -348,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
btree_path_idx_t path_idx =
bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
+ BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
@@ -366,7 +372,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
goto out;
i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_NORUN;
+ i->flags |= BTREE_TRIGGER_norun;
btree_path_set_should_be_locked(btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
@@ -377,7 +383,7 @@ out:
static int __must_check
bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_update_flags flags,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
@@ -473,15 +479,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
if (!iter->key_cache_path)
iter->key_cache_path =
bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT|
- BTREE_ITER_CACHED, _THIS_IP_);
+ BTREE_ITER_intent|
+ BTREE_ITER_cached, _THIS_IP_);
iter->key_cache_path =
bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
if (unlikely(ret))
return ret;
@@ -499,17 +505,17 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
}
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
return bch2_trans_update_extent(trans, iter, k, flags);
if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ !(flags & BTREE_UPDATE_key_cache_reclaim) &&
+ (iter->flags & BTREE_ITER_filter_snapshots)) {
ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
if (unlikely(ret < 0))
return ret;
@@ -522,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
* Ensure that updates to cached btrees go to the key cache:
*/
struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
!path->cached &&
!path->level &&
btree_id_cached(trans->c, path->btree_id)) {
@@ -581,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
k = bch2_btree_iter_prev(iter);
ret = bkey_err(k);
if (ret)
@@ -615,15 +621,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
int bch2_btree_insert_nonextent(struct btree_trans *trans,
enum btree_id btree, struct bkey_i *k,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
int ret;
bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_CACHED|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
@@ -631,16 +637,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
}
int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
- int ret;
-
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
+ BTREE_ITER_intent|flags);
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -692,8 +695,8 @@ int bch2_btree_delete(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -711,7 +714,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
@@ -739,7 +742,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
*/
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -798,7 +801,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
k->k.p = pos;
struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, 0);
@@ -846,7 +849,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
if (ret)
goto err;
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ if (!test_bit(JOURNAL_running, &c->journal.flags)) {
ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index cc7c53e83f89..b4894e4d5447 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -44,16 +44,18 @@ enum bch_trans_commit_flags {
#undef x
};
+void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
+
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_update_flags);
+ enum btree_iter_update_trigger_flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, int flags);
@@ -94,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_update_flags,
+ enum btree_iter_update_trigger_flags,
struct bkey_s_c, struct bkey_s_c);
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
@@ -276,7 +278,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
unsigned flags, unsigned type, unsigned min_bytes)
{
struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type);
+ btree_id, pos, flags|BTREE_ITER_intent, type);
struct bkey_i *ret = IS_ERR(k.k)
? ERR_CAST(k.k)
: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
@@ -299,7 +301,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
unsigned flags, unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+ btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
int ret;
if (IS_ERR(mut))
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index b2f5f2e50f7e..60b8544cea48 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -18,82 +19,140 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
+static const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BTREE_UPDATE_MODES()
+#undef x
+ NULL
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_INTENT, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
-/* Debug code: */
-
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos next_node = b->data->min_key;
- struct btree_node_iter iter;
+ struct bch_fs *c = trans->c;
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_btree_ptr_v2 bp;
- struct bkey unpacked;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->c.level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (b == btree_node_root(c, b)) {
+ if (!bpos_eq(b->data->min_key, POS_MIN)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ need_fsck_err(c, btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf);
+ goto topology_repair;
+ }
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
+ if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ need_fsck_err(c, btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf);
+ goto topology_repair;
+ }
+ }
+
+ if (!b->c.level)
+ return 0;
- bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- while (1) {
- k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
- break;
- bp = bkey_s_c_to_btree_ptr_v2(k);
+ goto out;
- if (!bpos_eq(next_node, bp.v->min_key)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, next_node);
- bch2_bpos_to_text(&buf2, bp.v->min_key);
- panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
- }
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- if (bch2_btree_node_iter_end(&iter)) {
- if (!bpos_eq(k.k->p, b->key.k.p)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, b->key.k.p);
- bch2_bpos_to_text(&buf2, k.k->p);
- panic("expected end %s got %s\n", buf1.buf, buf2.buf);
- }
- break;
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n"),
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
}
- next_node = bpos_successor(k.k->p);
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
+ }
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ } else {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
}
-#endif
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -101,7 +160,6 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
struct bkey_packed *k;
- struct bset_tree *t;
struct bkey uk;
for_each_bset(b, t)
@@ -254,7 +312,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
@@ -549,6 +607,26 @@ static void btree_update_add_key(struct btree_update *as,
bch2_keylist_push(keys);
}
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false;
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->sb_lock);
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@@ -569,7 +647,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -578,7 +656,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -606,6 +684,9 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
+
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -638,7 +719,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -648,12 +729,13 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
- if (as->b) {
-
- b = as->b;
- btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
/*
* @b is the node we did the final insert into:
*
@@ -671,12 +753,16 @@ err:
* btree_node_lock_nopath() (the use of which is always suspect,
* we need to work on removing this in the future)
*
- * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so
* we may rarely end up with a locked path besides the one we
* have here:
*/
bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+ btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
@@ -794,15 +880,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -824,7 +912,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -835,7 +923,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -849,7 +937,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1027,7 +1115,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1044,7 +1132,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1052,7 +1140,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1067,34 +1155,29 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
- unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
-
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_space_low, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1102,11 +1185,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1120,13 +1203,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->flags = flags;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1168,7 +1253,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -1220,23 +1305,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
bch2_btree_set_root_inmem(c, b);
@@ -1250,6 +1341,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
@@ -1269,7 +1361,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
- if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
@@ -1316,12 +1408,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1380,9 +1472,16 @@ static void __btree_split_node(struct btree_update *as,
if (bkey_deleted(k))
continue;
+ uk = bkey_unpack_key(b, k);
+
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+ n1_u64s += k->u64s;
+
i = u64s >= n1_u64s;
u64s += k->u64s;
- uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
@@ -1441,8 +1540,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
- if (b->c.level)
- btree_node_interior_verify(as->c, n[i]);
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@@ -1471,9 +1569,9 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- btree_node_interior_verify(as->c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
@@ -1488,9 +1586,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@@ -1517,12 +1620,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
+ path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path2, n2);
@@ -1567,7 +1670,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
@@ -1581,15 +1684,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- if (ret)
- goto err;
} else if (n3) {
- bch2_btree_set_root(as, trans, trans->paths + path, n3);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, trans->paths + path, n1);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
+ if (ret)
+ goto err;
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1646,27 +1750,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1687,7 +1770,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1710,9 +1794,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- btree_node_interior_verify(c, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
+
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1726,16 +1820,17 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
- btree_node_interior_verify(c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
@@ -1801,7 +1896,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
bch2_keylist_add(&as->parent_keys, &b->key);
btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
- bch2_btree_set_root(as, trans, path, n);
+ int ret = bch2_btree_set_root(as, trans, path, n, true);
+ BUG_ON(ret);
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
@@ -1818,9 +1915,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path,
{
struct bch_fs *c = trans->c;
struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b))
+ return bch2_btree_split_leaf(trans, path, flags);
+
struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path,
- b->c.level, true, flags);
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1848,9 +1948,27 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked(trans);
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
+
+ if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= BCH_WATERMARK_btree;
+ flags |= BCH_TRANS_COMMIT_journal_reclaim;
+ }
+
b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -1864,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
: bpos_successor(b->data->max_key);
sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
@@ -1957,7 +2075,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -1996,6 +2114,10 @@ err:
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
@@ -2031,7 +2153,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -2041,12 +2163,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
- if (ret)
- goto err;
} else {
- bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
+ if (ret)
+ goto err;
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@@ -2213,10 +2336,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (!skip_triggers) {
ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_TRANSACTIONAL) ?:
+ BTREE_TRIGGER_transactional) ?:
bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s(new_key),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -2233,7 +2356,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bch2_trans_copy_iter(&iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_INTENT,
+ iter2.flags & BTREE_ITER_intent,
_THIS_IP_);
struct btree_path *path2 = btree_iter_path(trans, &iter2);
@@ -2245,7 +2368,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
} else {
@@ -2353,7 +2476,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;
@@ -2367,7 +2490,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
BUG_ON(!btree_node_hashed(b));
- struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
@@ -2391,7 +2513,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2410,7 +2532,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
- b->c.level = 0;
+ b->c.level = level;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
@@ -2437,9 +2559,24 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0;
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+ bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+ prt_printf(out, "%ps: ", (void *) as->ip_started);
+ bch2_trans_commit_flags_to_text(out, as->flags);
+
+ prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ bch2_btree_id_str(as->btree_id),
+ as->update_level_start,
+ as->update_level_end,
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2448,12 +2585,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- (void *) as->ip_started,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
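
bch2_btree_node_check_topology() above enforces that an interior node's children tile the parent's key range exactly: the first child starts at the node's min_key, each following child starts at the successor of the previous child's max key, and the last child ends at the node's max_key. A standalone sketch of that invariant over plain integer ranges — simplified stand-ins, not the bcachefs bpos/bkey machinery:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct child_range { uint64_t min_key, max_key; };

static bool check_topology(uint64_t parent_min, uint64_t parent_max,
			   const struct child_range *c, size_t nr)
{
	if (!nr)
		return false;			/* empty interior node */

	uint64_t expect = parent_min;

	for (size_t i = 0; i < nr; i++) {
		if (c[i].min_key != expect)
			return false;		/* gap or overlap between siblings */
		expect = c[i].max_key + 1;	/* successor of prev child's max */
	}

	return c[nr - 1].max_key == parent_max;
}

int main(void)
{
	struct child_range good[] = { { 0, 9 }, { 10, 20 } };
	struct child_range bad[]  = { { 0, 9 }, { 11, 20 } };

	printf("good: %d, bad: %d\n",
	       check_topology(0, 20, good, 2),
	       check_topology(0, 20, bad, 2));	/* good: 1, bad: 0 */
	return 0;
}
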
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index f651dd48aaa0..b5b76ce01cfc 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -37,24 +51,19 @@ struct btree_update {
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_trans_commit_flags flags;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -135,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, level));
+ if (bch2_btree_node_merging_disabled)
+ return 0;
+
b = path->l[level].b;
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
@@ -163,7 +175,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
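
BTREE_UPDATE_MODES() above is an x-macro: the same list is expanded once into the enum in this header and once into the bch2_btree_update_modes[] string table in the .c file, so the constants and their printable names cannot drift apart. A self-contained sketch of the pattern with generic names (not the bcachefs identifiers):

#include <stdio.h>

#define MODES()		\
	x(none)		\
	x(node)		\
	x(root)		\
	x(update)

enum mode {
#define x(n) MODE_##n,
	MODES()
#undef x
};

static const char * const mode_strs[] = {
#define x(n) #n,
	MODES()
#undef x
	NULL
};

int main(void)
{
	printf("%s\n", mode_strs[MODE_root]);	/* prints "root" */
	return 0;
}
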
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 5cbad8445782..75c8a196b3f6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -11,6 +11,7 @@
#include "journal_reclaim.h"
#include <linux/prefetch.h>
+#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
@@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
@@ -113,7 +122,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
@@ -182,13 +191,13 @@ btree_write_buffered_insert(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ BTREE_ITER_cached|BTREE_ITER_intent);
trans->journal_res.seq = wb->journal_seq;
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -307,13 +316,23 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
}
}
if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
bch2_btree_iter_set_pos(&iter, k->k.k.p);
@@ -357,6 +376,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
@@ -368,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
+ BCH_TRANS_COMMIT_no_journal_res,
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
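
The slowpath above now sorts wb->flushing.keys by journal_seq before committing them one at a time, so the keys (and the journal pins they hold) are flushed in journal order. A minimal userspace sketch of the same three-way-comparator pattern using libc qsort() — simplified struct, not the kernel's sort()/cmp_int():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct wb_key { uint64_t journal_seq; int payload; };

static int wb_key_seq_cmp(const void *_l, const void *_r)
{
	const struct wb_key *l = _l, *r = _r;

	/* three-way compare: <0, 0 or >0, like cmp_int() */
	return (l->journal_seq > r->journal_seq) - (l->journal_seq < r->journal_seq);
}

int main(void)
{
	struct wb_key keys[] = { { 9, 1 }, { 3, 2 }, { 7, 3 } };
	size_t nr = sizeof(keys) / sizeof(keys[0]);

	qsort(keys, nr, sizeof(keys[0]), wb_key_seq_cmp);

	for (size_t i = 0; i < nr; i++)
		printf("seq=%llu payload=%d\n",
		       (unsigned long long) keys[i].journal_seq, keys[i].payload);
	return 0;
}
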
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 96edf2c34d43..ed97712d0db1 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -274,25 +274,14 @@ void bch2_dev_usage_init(struct bch_dev *ca)
void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
bch2_prt_data_type(out, i);
- prt_tab(out);
- prt_u64(out, usage->d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
+ usage->d[i].buckets,
+ usage->d[i].sectors,
+ usage->d[i].fragmented);
}
}
@@ -329,26 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_enable();
}
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
- return (struct bch_alloc_v4) {
- .gen = b.gen,
- .data_type = b.data_type,
- .dirty_sectors = b.dirty_sectors,
- .cached_sectors = b.cached_sectors,
- .stripe = b.stripe,
- };
-}
-
-void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket *old, struct bucket *new)
-{
- struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
- struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
-
- bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
-}
-
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry_v1 *r,
@@ -496,78 +465,277 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64
return bch2_update_replicas_list(trans, &r.e, sectors);
}
-int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bucket old, new, *g;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
- BUG_ON(data_type != BCH_DATA_sb &&
- data_type != BCH_DATA_journal);
+ percpu_down_read(&c->mark_lock);
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+ continue;
+ }
- percpu_down_read(&c->mark_lock);
- g = gc_bucket(ca, b);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
+
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ do_update = true;
+ }
+ }
- bucket_lock(g);
- old = *g;
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ do_update = true;
+ }
+ }
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
- goto err;
- }
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto next;
+
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ c, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ do_update = true;
+ }
+ }
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
- goto err;
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+ }
+next:
+ bch2_dev_put(ca);
}
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- new = *g;
+ if (do_update) {
+ if (flags & BTREE_TRIGGER_is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ rcu_read_lock();
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
+ rcu_read_unlock();
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+ ptr->gen = g->gen;
+ }
+ rcu_read_unlock();
+ } else {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
+ rcu_read_unlock();
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
+
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
+
+ percpu_up_read(&c->mark_lock);
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ }
err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
+fsck_err:
percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
-int bch2_check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 bucket_sectors)
+int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 *bucket_sectors)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
struct printbuf buf = PRINTBUF;
+ bool inserting = sectors > 0;
int ret = 0;
- if (bucket_data_type == BCH_DATA_cached)
- bucket_data_type = BCH_DATA_user;
-
- if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
- (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
- bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+ BUG_ON(!sectors);
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@@ -578,8 +746,9 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
@@ -592,11 +761,17 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
+ }
+
+ if (b_gen != ptr->gen && ptr->cached) {
+ ret = 1;
+ goto out;
}
- if (b_gen != ptr->gen && !ptr->cached) {
+ if (b_gen != ptr->gen) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
@@ -607,18 +782,12 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
- }
-
- if (b_gen != ptr->gen) {
- ret = 1;
+ if (inserting)
+ goto err;
goto out;
}
- if (!data_type_is_empty(bucket_data_type) &&
- ptr_data_type &&
- bucket_data_type != ptr_data_type) {
+ if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
@@ -628,28 +797,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
- if ((u64) bucket_sectors + sectors > U32_MAX) {
+ if ((u64) *bucket_sectors + sectors > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- bucket_sectors, sectors,
+ *bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ sectors = -*bucket_sectors;
}
+
+ *bucket_sectors += sectors;
out:
printbuf_exit(&buf);
return ret;
err:
bch2_dump_trans_updates(trans);
+ ret = -EIO;
goto out;
}
@@ -786,111 +960,92 @@ need_mark:
/* KEY_TYPE_extent: */
-static int __mark_pointer(struct btree_trans *trans,
+static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
+ struct bch_alloc_v4 *a)
{
u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type, *dst_sectors);
+ ? &a->dirty_sectors
+ : &a->cached_sectors;
+ int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type,
+ a->gen, a->data_type, dst_sectors);
if (ret)
return ret;
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
s64 *sectors,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ int ret = 0;
+
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (unlikely(!ca)) {
+ if (insert)
+ ret = -EIO;
+ goto err;
+ }
+
struct bpos bucket;
struct bch_backpointer bp;
-
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- int ret = PTR_ERR_OR_ZERO(a);
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v);
if (ret)
- return ret;
-
- ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
+ goto err;
if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
if (ret)
- return ret;
+ goto err;
}
}
- if (flags & BTREE_TRIGGER_GC) {
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
-
+ if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
bucket_lock(g);
- struct bucket old = *g;
-
- u8 bucket_data_type = g->data_type;
- int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (ret) {
- bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
- return ret;
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
+ if (!ret) {
+ alloc_to_bucket(g, new);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
-
- g->data_type = bucket_data_type;
- struct bucket new = *g;
bucket_unlock(g);
- bch2_dev_usage_update_m(c, ca, &old, &new);
percpu_up_read(&c->mark_lock);
}
-
- return 0;
+err:
+ bch2_dev_put(ca);
+ return ret;
}
static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
struct bkey_s_c k,
struct extent_ptr_decoded p,
enum bch_data_type data_type,
- s64 sectors, unsigned flags)
+ s64 sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
+ BTREE_ITER_with_updates, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
@@ -920,10 +1075,10 @@ err:
return ret;
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ BUG_ON(!(flags & BTREE_TRIGGER_gc));
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
@@ -959,9 +1114,10 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
+ bool gc = flags & BTREE_TRIGGER_gc;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -970,7 +1126,7 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 dirty_sectors = 0;
+ s64 replicas_sectors = 0;
int ret = 0;
r.e.data_type = data_type;
@@ -978,8 +1134,8 @@ static int __trigger_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+ s64 disk_sectors = 0;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
@@ -996,7 +1152,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
}
} else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
+ replicas_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -1014,8 +1170,8 @@ static int __trigger_extent(struct btree_trans *trans,
if (r.e.nr_devs) {
ret = !gc
- ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
- : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
+ ? bch2_update_replicas_list(trans, &r.e, replicas_sectors)
+ : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true);
if (unlikely(ret && gc)) {
struct printbuf buf = PRINTBUF;
@@ -1031,15 +1187,18 @@ static int __trigger_extent(struct btree_trans *trans,
}
int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
+
/* if pointers aren't changing - nothing to do: */
if (new_ptrs_bytes == old_ptrs_bytes &&
!memcmp(new_ptrs.start,
@@ -1047,7 +1206,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bch_fs *c = trans->c;
int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
(int) bch2_bkey_needs_rebalance(c, old);
@@ -1060,8 +1219,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
}
}
- if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
- return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
+ return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
return 0;
}
@@ -1069,17 +1228,17 @@ int bch2_trigger_extent(struct btree_trans *trans,
/* KEY_TYPE_reservation */
static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size * replicas;
- if (flags & BTREE_TRIGGER_OVERWRITE)
+ if (flags & BTREE_TRIGGER_overwrite)
sectors = -sectors;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
int ret = bch2_replicas_deltas_realloc(trans, 0);
if (ret)
return ret;
@@ -1090,7 +1249,7 @@ static int __trigger_reservation(struct btree_trans *trans,
d->persistent_reserved[replicas - 1] += sectors;
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
preempt_disable();
@@ -1110,7 +1269,7 @@ static int __trigger_reservation(struct btree_trans *trans,
int bch2_trigger_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
@@ -1118,22 +1277,16 @@ int bch2_trigger_reservation(struct btree_trans *trans,
/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
+ struct bch_dev *ca, u64 b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
int ret = 0;
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
if (IS_ERR(a))
return PTR_ERR(a);
@@ -1161,20 +1314,75 @@ err:
return ret;
}
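+/*
+ * gc-only counterpart of __bch2_trans_mark_metadata_bucket(): account the
+ * bucket directly in the in-memory gc bucket state instead of going through
+ * the alloc btree.
+ */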
+static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 b, enum bch_data_type data_type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
+{
+ int ret = 0;
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, b);
+
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_type_str(g->data_type ?: data_type),
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
- enum bch_data_type type,
- unsigned sectors)
+ struct bch_dev *ca, u64 b,
+ enum bch_data_type type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ BUG_ON(type != BCH_DATA_free &&
+ type != BCH_DATA_sb &&
+ type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
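+ /*
+ * gc triggers account in the in-memory gc buckets; transactional triggers
+ * update the alloc btree via a transaction commit.
+ */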
+ if (flags & BTREE_TRIGGER_gc)
+ return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags);
+ else if (flags & BTREE_TRIGGER_transactional)
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ else
+ BUG();
}
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- u64 *bucket, unsigned *bucket_sectors)
+ struct bch_dev *ca, u64 start, u64 end,
+ enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
+ enum btree_iter_update_trigger_flags flags)
{
do {
u64 b = sector_to_bucket(ca, start);
@@ -1183,7 +1391,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
if (b != *bucket && *bucket_sectors) {
int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors);
+ type, *bucket_sectors, flags);
if (ret)
return ret;
@@ -1198,8 +1406,8 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
return 0;
}
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
- struct bch_dev *ca)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
u64 bucket = 0;
@@ -1212,21 +1420,21 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, ca,
0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
if (bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors);
+ bucket, BCH_DATA_sb, bucket_sectors, flags);
if (ret)
return ret;
}
@@ -1234,7 +1442,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size);
+ BCH_DATA_journal, ca->mi.bucket_size, flags);
if (ret)
return ret;
}
@@ -1242,20 +1450,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
return 0;
}
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
-
+ int ret = bch2_trans_run(c,
+ __bch2_trans_mark_dev_sb(trans, ca, flags));
bch_err_fn(c, ret);
return ret;
}
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
+ enum btree_iter_update_trigger_flags flags)
{
for_each_online_member(c, ca) {
- int ret = bch2_trans_mark_dev_sb(c, ca);
+ int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return ret;
}
}
@@ -1263,6 +1473,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
return 0;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
@@ -1331,6 +1546,31 @@ recalculate:
/* Startup/shutdown: */
+void bch2_buckets_nouse_free(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ kvfree_rcu_mightsleep(ca->buckets_nouse);
+ ca->buckets_nouse = NULL;
+ }
+}
+
+int bch2_buckets_nouse_alloc(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->buckets_nouse);
+
+ ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!ca->buckets_nouse) {
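+ /* drop the ref for_each_member_device() took before returning early */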
+ bch2_dev_put(ca);
+ return -BCH_ERR_ENOMEM_buckets_nouse;
+ }
+ }
+
+ return 0;
+}
+
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
@@ -1342,24 +1582,17 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- unsigned long *buckets_nouse = NULL;
bool resize = ca->bucket_gens != NULL;
int ret;
+ BUG_ON(resize && ca->buckets_nouse);
+
if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
- if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
- ret = -BCH_ERR_ENOMEM_buckets_nouse;
- goto err;
- }
-
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
@@ -1377,17 +1610,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(bucket_gens->b,
old_bucket_gens->b,
n);
- if (buckets_nouse)
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
bucket_gens = old_bucket_gens;
- swap(ca->buckets_nouse, buckets_nouse);
-
nbuckets = ca->mi.nbuckets;
if (resize) {
@@ -1398,7 +1625,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f789..617ffde2fb7a 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -12,7 +12,7 @@
#include "extents.h"
#include "sb-members.h"
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
{
return div_u64(s, ca->mi.bucket_size);
}
@@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return remainder;
}
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
- u32 *offset)
+static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
{
return div_u64_rem(s, ca->mi.bucket_size, offset);
}
@@ -94,7 +93,7 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ BUG_ON(!bucket_valid(ca, b));
return buckets->b + b;
}
@@ -111,7 +110,7 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ BUG_ON(!bucket_valid(ca, b));
return gens->b + b;
}
@@ -121,20 +120,16 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr,
u32 *bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
}
@@ -175,17 +170,19 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
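+/*
+ * Caller is expected to hold rcu_read_lock(); dev_ptr_stale() below is the
+ * locked wrapper.
+ */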
+static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+{
+ return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+}
+
/**
- * ptr_stale() - check if a pointer points into a bucket that has been
+ * dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 ptr_stale(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- u8 ret;
-
rcu_read_lock();
- ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ u8 ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
return ret;
@@ -226,6 +223,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
@@ -305,8 +303,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
const struct bch_alloc_v4 *,
const struct bch_alloc_v4 *, u64, bool);
-void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
- struct bucket *, struct bucket *);
/* key/bucket marking: */
@@ -332,27 +328,29 @@ int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
-int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
- const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32);
+int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
+ struct bkey_s_c, const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32 *);
-int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
+int bch2_check_fix_ptrs(struct btree_trans *,
+ enum btree_id, unsigned, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
ret; \
})
@@ -361,9 +359,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
+ enum bch_data_type, unsigned,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
+ enum btree_iter_update_trigger_flags);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
@@ -394,14 +396,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
: "(invalid data type)";
}
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
- if (type < BCH_DATA_NR)
- prt_str(out, __bch2_data_types[type]);
- else
- prt_printf(out, "(invalid data type %u)", type);
-}
-
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
@@ -471,6 +465,9 @@ static inline u64 avail_factor(u64 r)
return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
+void bch2_buckets_nouse_free(struct bch_fs *);
+int bch2_buckets_nouse_alloc(struct bch_fs *);
+
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
void bch2_dev_buckets_free(struct bch_dev *);
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 38defa19d52d..9e54323f0f5f 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,7 +7,7 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -32,12 +32,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
if (dev >= c->sb.nr_devices)
return ERR_PTR(-EINVAL);
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
return ERR_PTR(-EINVAL);
} else {
@@ -134,42 +129,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
- char **devs;
- size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
-
- if (IS_ERR(c))
- return PTR_ERR(c);
+ struct bch_fs *c = thr->c;
- int ret = 0;
- if (test_bit(BCH_FS_errors_fixed, &c->flags))
- ret |= 1;
- if (test_bit(BCH_FS_error, &c->flags))
- ret |= 4;
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
- bch2_fs_stop(c);
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
- if (ret & 1)
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- if (ret & 4)
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
return ret;
}
@@ -182,7 +173,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
+ darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -194,29 +185,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
}
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
}
+ thr->opts = bch2_opts_empty();
+
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
@@ -230,15 +224,28 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
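+ /*
+ * A userspace-requested fsck shouldn't panic the machine: downgrade
+ * errors=panic to read-only.
+ */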
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
@@ -379,7 +386,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
return PTR_ERR(ca);
ret = bch2_dev_offline(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -408,7 +415,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
if (ret)
bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -603,7 +610,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.d[i].fragmented = src.d[i].fragmented;
}
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
}
@@ -655,7 +662,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
goto err;
}
err:
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -677,11 +684,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
if (arg.flags & BCH_READ_DEV) {
ca = bch2_device_lookup(c, arg.dev, arg.flags);
-
- if (IS_ERR(ca)) {
- ret = PTR_ERR(ca);
- goto err;
- }
+ ret = PTR_ERR_OR_ZERO(ca);
+ if (ret)
+ goto err_unlock;
sb = ca->disk_sb.sb;
} else {
@@ -696,8 +701,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
vstruct_bytes(sb));
err:
- if (!IS_ERR_OR_NULL(ca))
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
+err_unlock:
mutex_unlock(&c->sb_lock);
return ret;
}
@@ -741,7 +746,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
ret = bch2_dev_resize(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -767,7 +772,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -949,7 +954,9 @@ static const struct file_operations bch_chardev_fops = {
};
static int bch_chardev_major;
-static struct class *bch_chardev_class;
+static const struct class bch_chardev_class = {
+ .name = "bcachefs",
+};
static struct device *bch_chardev;
void bch2_fs_chardev_exit(struct bch_fs *c)
@@ -966,7 +973,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
if (c->minor < 0)
return c->minor;
- c->chardev = device_create(bch_chardev_class, NULL,
+ c->chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, c->minor), c,
"bcachefs%u-ctl", c->minor);
if (IS_ERR(c->chardev))
@@ -977,32 +984,39 @@ int bch2_fs_chardev_init(struct bch_fs *c)
void bch2_chardev_exit(void)
{
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- device_destroy(bch_chardev_class,
- MKDEV(bch_chardev_major, U8_MAX));
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- class_destroy(bch_chardev_class);
+ device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
+ class_unregister(&bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcachefs");
}
int __init bch2_chardev_init(void)
{
+ int ret;
+
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
if (bch_chardev_major < 0)
return bch_chardev_major;
- bch_chardev_class = class_create("bcachefs");
- if (IS_ERR(bch_chardev_class))
- return PTR_ERR(bch_chardev_class);
+ ret = class_register(&bch_chardev_class);
+ if (ret)
+ goto major_out;
- bch_chardev = device_create(bch_chardev_class, NULL,
+ bch_chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, U8_MAX),
NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev))
- return PTR_ERR(bch_chardev);
+ if (IS_ERR(bch_chardev)) {
+ ret = PTR_ERR(bch_chardev);
+ goto class_out;
+ }
return 0;
+
+class_out:
+ class_unregister(&bch_chardev_class);
+major_out:
+ unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+ return ret;
}
#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 4701457f6381..3bd3aba90d8f 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -102,6 +102,7 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
int ret;
skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
@@ -232,7 +233,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
@@ -306,7 +307,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
@@ -351,8 +352,12 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
bytes += bv.bv_len;
}
- sg_mark_end(sg - 1);
- return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (sg != sgl) {
+ sg_mark_end(sg - 1);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ }
+
+ return ret;
}
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
@@ -429,15 +434,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo,
- bch2_csum_types[crc_old.csum_type],
- bch2_csum_types[new_csum_type]);
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
@@ -463,9 +473,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
/* BCH_SB_FIELD_crypt: */
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
@@ -488,14 +497,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
- prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
- prt_newline(out);
+ prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
+ prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
+ prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
}
const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
@@ -647,26 +652,26 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
- int ret;
-
- if (!c->chacha20)
- c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->chacha20);
+ if (c->chacha20)
+ return 0;
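+ /* allocate both ciphers locally; only install them in c once both succeed */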
+ struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ int ret = PTR_ERR_OR_ZERO(chacha20);
if (ret) {
bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
return ret;
}
- if (!c->poly1305)
- c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->poly1305);
-
+ struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ ret = PTR_ERR_OR_ZERO(poly1305);
if (ret) {
bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
+ c->chacha20 = chacha20;
+ c->poly1305 = poly1305;
return 0;
}
@@ -761,11 +766,11 @@ err:
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->poly1305))
+ if (c->poly1305)
crypto_free_shash(c->poly1305);
- if (!IS_ERR_OR_NULL(c->chacha20))
+ if (c->chacha20)
crypto_free_sync_skcipher(c->chacha20);
- if (!IS_ERR_OR_NULL(c->sha256))
+ if (c->sha256)
crypto_free_shash(c->sha256);
}
@@ -778,6 +783,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
ret = PTR_ERR_OR_ZERO(c->sha256);
if (ret) {
+ c->sha256 = NULL;
bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016dc..e40499fde9a4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
struct bch_csum expected,
struct bch_csum got)
{
- prt_printf(out, "checksum error: got ");
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
- prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb45570f..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
- if (type < BCH_COMPRESSION_TYPE_NR)
- prt_str(out, __bch2_compression_types[type]);
- else
- prt_printf(out, "(invalid compression type %u)", type);
-}
-
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42a2..0d807c2ce9c6 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -14,6 +14,7 @@
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
@@ -105,7 +106,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
while (1) {
struct bkey_s_c k;
@@ -202,6 +203,8 @@ restart_drop_conflicting_replicas:
/* Now, drop excess replicas: */
restart_drop_extra_replicas:
+
+ rcu_read_lock();
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
@@ -213,6 +216,7 @@ restart_drop_extra_replicas:
goto restart_drop_extra_replicas;
}
}
+ rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -287,7 +291,7 @@ restart_drop_extra_replicas:
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
@@ -356,10 +360,11 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
if (c->opts.nocow_enabled)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ PTR_BUCKET_POS(ca, ptr), 0);
+ bch2_dev_put(ca);
}
bch2_bkey_buf_exit(&update->k, c);
@@ -385,8 +390,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
while (bio_sectors(bio)) {
unsigned sectors = bio_sectors(bio);
+ bch2_trans_begin(trans);
+
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
k = bch2_btree_iter_peek_slot(&iter);
bkey_err(k);
@@ -464,7 +471,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
@@ -479,15 +485,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
@@ -509,6 +515,14 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned ptrs_locked = 0;
int ret = 0;
+ /*
+ * If the fs is corrupt we may have a key for a snapshot node that doesn't
+ * exist, and we have to check for this because we go rw before repairing
+ * the snapshots table - just skip it; we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -530,15 +544,26 @@ int bch2_data_update_init(struct btree_trans *trans,
m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
- bkey_for_each_ptr(ptrs, ptr)
- percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!bch2_dev_tryget(c, ptr->dev)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+ }
+ return -BCH_ERR_data_update_done;
+ }
+ }
unsigned durability_have = 0, durability_removing = 0;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
bool locked;
+ rcu_read_lock();
if (((1U << i) & m->data_opts.rewrite_ptrs)) {
BUG_ON(p.ptr.cached);
@@ -552,6 +577,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
+ rcu_read_unlock();
/*
* op->csum_type is normally initialized from the fs/file's
@@ -570,16 +596,13 @@ int bch2_data_update_init(struct btree_trans *trans,
if (ctxt) {
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
+ bucket, 0)) ||
+ list_empty(&ctxt->ios));
if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
} else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
ret = -BCH_ERR_nocow_lock_blocked;
goto err;
}
@@ -590,6 +613,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -599,7 +624,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ !durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
@@ -608,11 +633,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
- BUG_ON(!m->op.nr_replicas);
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a durability=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
@@ -632,10 +664,11 @@ int bch2_data_update_init(struct btree_trans *trans,
err:
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
if ((1U << i) & ptrs_locked)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ bch2_dev_put(ca);
i++;
}
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 208ce6f0fc43..51cbf3928361 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -36,11 +37,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_ondisk = c->verify_ondisk;
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
struct bio *bio;
bool failed = false, saw_error = false;
- if (!bch2_dev_get_ioref(ca, READ))
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca)
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
@@ -193,8 +194,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- if (!bch2_dev_get_ioref(ca, READ)) {
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
prt_printf(out, "error getting device to read from: not online\n");
return;
}
@@ -374,8 +375,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
bch2_bkey_val_to_text(&i->buf, i->c, k);
prt_newline(&i->buf);
bch2_trans_unlock(trans);
@@ -458,8 +459,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
struct btree_path_level *l =
&btree_iter_path(trans, &iter)->l[0];
struct bkey_packed *_k =
@@ -491,51 +492,26 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_printf(out, "%px btree=%s l=%u ",
- b,
- bch2_btree_id_str(b->c.btree_id),
- b->c.level);
- prt_newline(out);
+ prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level);
printbuf_indent_add(out, 2);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
prt_newline(out);
- prt_printf(out, "flags: ");
- prt_tab(out);
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_btree_node_flags, b->flags);
prt_newline(out);
- prt_printf(out, "pcpu read locks: ");
- prt_tab(out);
- prt_printf(out, "%u", b->c.lock.readers != NULL);
- prt_newline(out);
-
- prt_printf(out, "written:");
- prt_tab(out);
- prt_printf(out, "%u", b->written);
- prt_newline(out);
-
- prt_printf(out, "writes blocked:");
- prt_tab(out);
- prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
- prt_newline(out);
+ prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
+ prt_printf(out, "written:\t%u\n", b->written);
+ prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
+ prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
- prt_printf(out, "will make reachable:");
- prt_tab(out);
- prt_printf(out, "%lx", b->will_make_reachable);
- prt_newline(out);
-
- prt_printf(out, "journal pin %px:", &b->writes[0].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[0].journal.seq);
- prt_newline(out);
-
- prt_printf(out, "journal pin %px:", &b->writes[1].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[1].journal.seq);
- prt_newline(out);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[0].journal, b->writes[0].journal.seq);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[1].journal, b->writes[1].journal.seq);
printbuf_indent_sub(out, 2);
}
@@ -624,8 +600,7 @@ restart:
bch2_btree_trans_to_text(&i->buf, trans);
- prt_printf(&i->buf, "backtrace:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
@@ -668,7 +643,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +651,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +671,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -746,25 +756,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
!bch2_btree_transaction_fns[i->iter])
break;
- prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
printbuf_indent_add(&i->buf, 2);
mutex_lock(&s->lock);
- prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
- prt_newline(&i->buf);
-
- prt_printf(&i->buf, "Transaction duration:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+ prt_printf(&i->buf, "Transaction duration:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->duration);
printbuf_indent_sub(&i->buf, 2);
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Lock hold times:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
@@ -772,8 +777,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
}
if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
printbuf_indent_add(&i->buf, 2);
prt_str_indented(&i->buf, s->max_paths_text);
@@ -866,6 +870,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +906,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +923,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d37bd07afbfe..c67460d8205d 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -15,6 +15,9 @@
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
+ if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
+ return 0;
+
unsigned bkey_u64s = bkey_val_u64s(d.k);
unsigned bkey_bytes = bkey_u64s * sizeof(u64);
u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
@@ -98,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
};
int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
@@ -118,7 +121,7 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err,
dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
@@ -205,7 +208,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
@@ -220,9 +223,8 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ dir_inum, snapshot, &dirent->k_i,
+ flags|BTREE_UPDATE_internal_snapshot_node);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -232,7 +234,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -243,7 +245,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, str_hash_flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -272,7 +274,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
} else {
target->subvol = le32_to_cpu(d.v->d_child_subvol);
- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
target->inum = le64_to_cpu(s.inode);
}
@@ -301,13 +303,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
- ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_src = bch2_btree_iter_peek_slot(&src_iter);
+ old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
goto out;
@@ -329,13 +327,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
} else {
- ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
goto out;
@@ -450,7 +444,7 @@ out_set_src:
if (delete_src) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter) ?:
- bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
@@ -458,7 +452,7 @@ out_set_src:
if (delete_dst) {
bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
ret = bch2_btree_iter_traverse(&dst_iter) ?:
- bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
@@ -479,13 +473,9 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
- int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ int ret = bkey_err(k);
if (ret)
goto err;
@@ -541,16 +531,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
+static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
+{
+ struct qstr name = bch2_dirent_get_name(d);
+ bool ret = dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(d.v->d_type));
+ if (ret)
+ ctx->pos = d.k->p.offset + 1;
+ return !ret;
+}
+
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
subvol_inum target;
u32 snapshot;
struct bkey_buf sk;
- struct qstr name;
int ret;
bch2_bkey_buf_init(&sk);
@@ -567,7 +567,9 @@ retry:
if (k.k->type != KEY_TYPE_dirent)
continue;
- dirent = bkey_s_c_to_dirent(k);
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
ret = bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret < 0)
@@ -575,28 +577,22 @@ retry:
if (ret)
continue;
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- dirent = bkey_i_to_s_c_dirent(sk.k);
- bch2_trans_unlock(trans);
-
- name = bch2_dirent_get_name(dirent);
-
- ctx->pos = dirent.k->p.offset;
- if (!dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(dirent.v->d_type)))
- break;
- ctx->pos = dirent.k->p.offset + 1;
-
/*
* read_target looks up subvolumes, we can overflow paths if the
* directory has many subvolumes in it
+ *
+ * XXX: btree_trans_too_many_iters() is something we'd like to
+ * get rid of, and there's no good reason to be using it here
+ * except that we don't yet have a for_each_btree_key() helper
+ * that does subvolume_get_snapshot().
*/
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+ ret = drop_locks_do(trans,
+ bch2_dir_emit(ctx, dirent, target)) ?:
+ btree_trans_too_many_iters(trans);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
break;
+ }
}
bch2_trans_iter_exit(trans, &iter);
err:
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index bee55cca2aa0..24037e6e0a09 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -4,11 +4,11 @@
#include "str_hash.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
@@ -38,11 +38,11 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 06a7df529b40..521a86df5e52 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
@@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
struct bch_disk_group_cpu *dst;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
continue;
g = BCH_MEMBER_GROUP(&m);
@@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
ca = bch2_dev_lookup(c, val);
if (!IS_ERR(ca)) {
*res = dev_to_target(ca->dev_idx);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return 0;
}
@@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi
case TARGET_DEV: {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
- if (bch2_dev_exists(sb, t.dev)) {
+ if (bch2_member_exists(sb, t.dev)) {
prt_printf(out, "Device ");
pr_uuid(out, m.uuid.b);
prt_printf(out, " (%u)", t.dev);
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
new file mode 100644
index 000000000000..698990bbf1d2
--- /dev/null
+++ b/fs/bcachefs/disk_groups_format.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
+#define _BCACHEFS_DISK_GROUPS_FORMAT_H
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 082075244e16..d8b9beca3776 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -107,7 +107,7 @@ struct ec_bio {
/* Stripes btree keys: */
int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -131,174 +131,221 @@ fsck_err:
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
+
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
- s->algorithm,
- le16_to_cpu(s->sectors),
- nr_data,
- s->nr_redundant,
- s->csum_type,
- 1U << s->csum_granularity_bits);
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
- for (i = 0; i < s->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = s->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+ bch2_extent_ptr_to_text(out, c, ptr);
- prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
- if (i < nr_data)
- prt_printf(out, "#%u", stripe_blockcount_get(s, i));
- prt_printf(out, " gen %u", ptr->gen);
- if (ptr_stale(ca, ptr))
- prt_printf(out, " stale");
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
}
}
/* Triggers: */
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
+static int __mark_stripe_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ struct bpos bucket,
+ struct bch_alloc_v4 *a,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
+ struct bch_fs *c = trans->c;
if (deleting)
sectors = -sectors;
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors);
- if (ret)
- goto err;
-
if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
+ if (bch2_trans_inconsistent_on(a->stripe ||
+ a->stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->stripe, s.k->p.offset,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- s.k->p.offset)) {
+ if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
} else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
+ if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
+ a->stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->stripe,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
+ "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ bch2_data_type_str(data_type),
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(parity &&
+ (a->dirty_sectors != -sectors ||
+ a->cached_sectors), trans,
+ "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+ }
+
+ if (sectors) {
+ ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
+ a->gen, a->data_type, &a->dirty_sectors);
+ if (ret)
+ goto err;
}
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
+ if (!deleting) {
+ a->stripe = s.k->p.offset;
+ a->stripe_redundancy = s.v->nr_redundant;
+ } else {
+ a->stripe = 0;
+ a->stripe_redundancy = 0;
+ }
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
+ alloc_data_type_set(a, data_type);
err:
- bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
- struct printbuf buf = PRINTBUF;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
int ret = 0;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
+ if (unlikely(!ca)) {
+ if (!(flags & BTREE_TRIGGER_overwrite))
+ ret = -EIO;
+ goto err;
+ }
- /* * XXX doesn't handle deletion */
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update(trans, bucket);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ }
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
- goto err;
+ if (flags & BTREE_TRIGGER_gc) {
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
+ if (!ret) {
+ alloc_to_bucket(g, new);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ }
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
}
+err:
+ bch2_dev_put(ca);
+ return ret;
+}
+
+static int mark_stripe_buckets(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
- bucket_lock(g);
- old = *g;
+ BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
- ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors);
- if (ret)
- goto err;
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- g->data_type = data_type;
- g->dirty_sectors += sectors;
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
-err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
+ if (new_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true, flags);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
}
int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_s_c new = _new.s_c;
struct bch_fs *c = trans->c;
@@ -308,7 +355,10 @@ int bch2_trigger_stripe(struct btree_trans *trans,
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
+
+ if (flags & BTREE_TRIGGER_transactional) {
/*
* If the pointers aren't changing, we don't need to do anything:
*/
@@ -343,31 +393,12 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
- if (ret)
- return ret;
- }
- }
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
}
- if (flags & BTREE_TRIGGER_ATOMIC) {
+ if (flags & BTREE_TRIGGER_atomic) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
@@ -406,7 +437,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct gc_stripe *m =
genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
@@ -435,13 +466,11 @@ int bch2_trigger_stripe(struct btree_trans *trans,
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
- for (unsigned i = 0; i < new_s->nr_blocks; i++) {
- int ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
- int ret = bch2_update_replicas(c, new, &m->r.e,
+ ret = bch2_update_replicas(c, new, &m->r.e,
((s64) m->sectors * m->nr_redundant),
0, true);
if (ret) {
@@ -604,21 +633,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf err = PRINTBUF;
- struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
-
- prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
- want.hi, want.lo,
- got.hi, got.lo,
- bch2_csum_types[v->csum_type]);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
+ struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
+ if (ca) {
+ struct printbuf err = PRINTBUF;
- clear_bit(i, buf->valid);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ clear_bit(i, buf->valid);
break;
}
@@ -685,7 +714,7 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (ptr_stale(ca, ptr)) {
+ if (dev_ptr_stale(ca, ptr)) {
bch_err_ratelimited(ca->fs,
"error %s stripe: stale pointer after io",
bio_data_dir(bio) == READ ? "reading from" : "writing to");
@@ -703,25 +732,26 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
unsigned offset = 0, bytes = buf->size << 9;
struct bch_extent_ptr *ptr = &v->ptrs[idx];
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
- if (ptr_stale(ca, ptr)) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ if (!ca) {
clear_bit(idx, buf->valid);
return;
}
- if (!bch2_dev_get_ioref(ca, rw)) {
+ if (dev_ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
clear_bit(idx, buf->valid);
return;
}
+
this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
while (offset < bytes) {
@@ -767,7 +797,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_SLOTS);
+ POS(0, idx), BTREE_ITER_slots);
ret = bkey_err(k);
if (ret)
goto err;
@@ -878,7 +908,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
- if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+ if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1058,7 +1088,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1129,7 +1159,7 @@ static int ec_stripe_key_update(struct btree_trans *trans,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
+ new->k.p, BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1171,6 +1201,7 @@ err:
}
static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket, u8 gen,
struct ec_stripe_buf *s,
struct bpos *bp_pos)
@@ -1181,13 +1212,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_ptr *ec_ptr = NULL;
struct bch_extent_stripe_ptr stripe_ptr;
struct bkey_i *n;
int ret, dev, block;
- ret = bch2_get_next_backpointer(trans, bucket, gen,
- bp_pos, &bp, BTREE_ITER_CACHED);
+ ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
+ bp_pos, &bp, BTREE_ITER_cached);
if (ret)
return ret;
if (bpos_eq(*bp_pos, SPOS_MAX))
@@ -1212,7 +1243,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
return -EIO;
}
- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1270,17 +1301,21 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
{
struct bch_fs *c = trans->c;
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr bucket = v->ptrs[block];
- struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bch_extent_ptr ptr = v->ptrs[block];
struct bpos bp_pos = POS_MIN;
int ret = 0;
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
+ if (!ca)
+ return -EIO;
+
+ struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
+
while (1) {
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
- s, &bp_pos));
+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
if (ret)
break;
if (bkey_eq(bp_pos, POS_MAX))
@@ -1289,6 +1324,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
bp_pos = bpos_nosnap_successor(bp_pos);
}
+ bch2_dev_put(ca);
return ret;
}
@@ -1319,20 +1355,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- int ret;
-
- if (!bch2_dev_get_ioref(ca, WRITE)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ if (!ca) {
s->err = -BCH_ERR_erofs_no_writes;
return;
}
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
memset(s->new_stripe.data[block] + (offset << 9),
0,
ob->sectors_free << 9);
- ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
ob->bucket * ca->mi.bucket_size + offset,
ob->sectors_free,
GFP_KERNEL, 0);
@@ -1517,16 +1551,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct bch_dev *ca;
- unsigned offset;
-
if (!ob)
return NULL;
BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
- ca = bch_dev_bkey_exists(c, ob->dev);
- offset = ca->mi.bucket_size - ob->sectors_free;
+ struct bch_dev *ca = ob_dev(c, ob);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
@@ -1935,7 +1966,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
}
for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
@@ -2125,7 +2156,7 @@ int bch2_stripes_read(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
if (k.k->type != KEY_TYPE_stripe)
continue;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e805..84a23eeb6249 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -6,14 +6,15 @@
#include "buckets_types.h"
#include "extents_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
@@ -32,6 +33,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index af25d8ec60f2..dbe35b80bc0b 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -175,6 +175,7 @@
x(EINVAL, block_size_too_small) \
x(EINVAL, bucket_size_too_small) \
x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_size_too_big) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
@@ -252,7 +253,8 @@
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, need_inode_lock)
+ x(0, need_inode_lock) \
+ x(0, invalid_snapshot_node)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 043431206799..c66eeffcd7f2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
-#include "recovery.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -174,6 +176,21 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
return s;
}
+/* s/fix?/fixing/ s/recreate?/recreating/ */
+static void prt_actioning(struct printbuf *out, const char *action)
+{
+ unsigned len = strlen(action);
+
+ BUG_ON(action[len - 1] != '?');
+ --len;
+
+ if (action[len - 1] == 'e')
+ --len;
+
+ prt_bytes(out, action, len);
+ prt_str(out, "ing");
+}
+
int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
@@ -184,6 +201,7 @@ int bch2_fsck_err(struct bch_fs *c,
bool print = true, suppressing = false, inconsistent = false;
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
+ const char *action_orig = "fix?", *action = action_orig;
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
@@ -195,6 +213,19 @@ int bch2_fsck_err(struct bch_fs *c,
prt_vprintf(out, fmt, args);
va_end(args);
+ /* Custom fix/continue/recreate/etc.? */
+ if (out->buf[out->pos - 1] == '?') {
+ const char *p = strrchr(out->buf, ',');
+ if (p) {
+ out->pos = p - out->buf;
+ action = kstrdup(p + 2, GFP_KERNEL);
+ if (!action) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ }
+
mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
@@ -206,12 +237,16 @@ int bch2_fsck_err(struct bch_fs *c,
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
mutex_unlock(&c->fsck_error_msgs_lock);
- printbuf_exit(&buf);
- return ret;
+ goto err;
}
kfree(s->last_msg);
s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+ if (!s->last_msg) {
+ mutex_unlock(&c->fsck_error_msgs_lock);
+ ret = -ENOMEM;
+ goto err;
+ }
if (c->opts.ratelimit_errors &&
!(flags & FSCK_NO_RATELIMIT) &&
@@ -237,7 +272,8 @@ int bch2_fsck_err(struct bch_fs *c,
inconsistent = true;
ret = -BCH_ERR_fsck_errors_not_fixed;
} else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
prt_str(out, ", continuing");
@@ -252,16 +288,16 @@ int bch2_fsck_err(struct bch_fs *c,
: c->opts.fix_errors;
if (fix == FSCK_FIX_ask) {
- int ask;
+ prt_str(out, ", ");
+ prt_str(out, action);
- prt_str(out, ": fix?");
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s", out->buf);
else
bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn(c);
+ int ask = bch2_fsck_ask_yn(c);
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
@@ -274,10 +310,12 @@ int bch2_fsck_err(struct bch_fs *c,
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
- prt_str(out, ", not fixing");
+ prt_str(out, ", not ");
+ prt_actioning(out, action);
}
} else if (flags & FSCK_NEED_FSCK) {
prt_str(out, " (run fsck to correct)");
@@ -309,8 +347,6 @@ int bch2_fsck_err(struct bch_fs *c,
mutex_unlock(&c->fsck_error_msgs_lock);
- printbuf_exit(&buf);
-
if (inconsistent)
bch2_inconsistent_error(c);
@@ -320,7 +356,10 @@ int bch2_fsck_err(struct bch_fs *c,
set_bit(BCH_FS_errors_not_fixed, &c->flags);
set_bit(BCH_FS_error, &c->flags);
}
-
+err:
+ if (action != action_orig)
+ kfree(action);
+ printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index ae1d6674c512..36caedf72d89 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *);
int bch2_topology_error(struct bch_fs *);
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
+
#define bch2_fs_inconsistent(c, ...) \
({ \
bch_err(c, __VA_ARGS__); \
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index b9033bb4f11c..5f4fecb358da 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_SLOTS, r_k, ret2) {
+ BTREE_ITER_slots, r_k, ret2) {
if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
break;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113df9..469037929685 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -71,6 +71,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
}
}
+static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
+}
+
/*
* returns true if p1 is better than p2:
*/
@@ -79,11 +85,8 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p2)
{
if (likely(!p1.idx && !p2.idx)) {
- struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
- struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
-
- u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
- u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+ u64 l1 = dev_latency(c, p1.ptr.dev);
+ u64 l2 = dev_latency(c, p2.ptr.dev);
/* Pick at random, biased in favor of the faster device: */
@@ -109,21 +112,21 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
- struct bch_dev *ca;
int ret = 0;
if (k.k->type == KEY_TYPE_error)
return -EIO;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
- if (p.ptr.unwritten)
- return 0;
-
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ if (p.ptr.unwritten) {
+ ret = 0;
+ break;
+ }
/*
* If there are any dirty pointers it's an error if we can't
@@ -132,7 +135,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (!ret && !p.ptr.cached)
ret = -EIO;
- if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+
+ if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
@@ -141,12 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
- if (!p.idx &&
- !bch2_dev_is_readable(ca))
+ if (!p.idx && !ca)
p.idx++;
- if (bch2_force_reconstruct_read &&
- !p.idx && p.has_ec)
+ if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
+ p.idx++;
+
+ if (!p.idx && !bch2_dev_is_readable(ca))
p.idx++;
if (p.idx >= (unsigned) p.has_ec + 1)
@@ -158,6 +164,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
*pick = p;
ret = 1;
}
+ rcu_read_unlock();
return ret;
}
@@ -165,7 +172,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -186,16 +193,26 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
- btree_ptr_v2_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, err, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, err, btree_ptr_v2_min_key_bad,
+ "min_key > key");
+
+ if (flags & BCH_VALIDATE_write)
+ bkey_fsck_err_on(!bp.v->sectors_written,
+ c, err, btree_ptr_v2_written_0,
+ "sectors_written == 0");
+
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
@@ -242,7 +259,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
const union bch_extent_entry *en_r;
struct extent_ptr_decoded lp, rp;
bool use_right_ptr;
- struct bch_dev *ca;
en_l = l_ptrs.start;
en_r = r_ptrs.start;
@@ -273,8 +289,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp.ptr.dev);
- if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
+ bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
+ rcu_read_unlock();
+
+ if (!same_bucket)
return false;
if (lp.has_ec != rp.has_ec ||
@@ -380,7 +400,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
@@ -662,16 +682,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- return __extent_ptr_durability(ca, p);
+ return ca ? __extent_ptr_durability(ca, p) : 0;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
return 0;
return __extent_ptr_durability(ca, p);
@@ -684,8 +704,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -697,9 +719,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -828,8 +852,6 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr;
-
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
@@ -855,14 +877,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_dev *ca;
+ bool ret = false;
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return true;
+ !dev_ptr_stale_rcu(ca, ptr))) {
+ ret = true;
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -964,15 +993,44 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
*/
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
- struct bch_extent_ptr *ptr;
+ struct bch_dev *ca;
+ rcu_read_lock();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
+ dev_ptr_stale_rcu(ca, ptr));
+ rcu_read_unlock();
return bkey_deleted(k.k);
}
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+ out->atomic++;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
+ prt_printf(out, " stale");
+ }
+ rcu_read_unlock();
+ --out->atomic;
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -988,42 +1046,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
- prt_printf(out, " stale");
- }
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break;
- }
+
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
crc.compressed_size,
crc.uncompressed_size,
- crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type]);
+ crc.offset, crc.nonce);
+ bch2_prt_csum_type(out, crc.csum_type);
+ prt_str(out, " compress ");
bch2_prt_compression_type(out, crc.compression_type);
break;
}
@@ -1057,55 +1095,50 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
const struct bch_extent_ptr *ptr,
unsigned size_ondisk,
bool metadata,
struct printbuf *err)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- u64 bucket;
- u32 bucket_offset;
- struct bch_dev *ca;
int ret = 0;
- if (!bch2_dev_exists2(c, ptr->dev)) {
- /*
- * If we're in the write path this key might have already been
- * overwritten, and we could be seeing a device that doesn't
- * exist anymore due to racing with device removal:
- */
- if (flags & BKEY_INVALID_WRITE)
- return 0;
-
- bkey_fsck_err(c, err, ptr_to_invalid_device,
- "pointer to invalid device (%u)", ptr->dev);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca) {
+ rcu_read_unlock();
+ return 0;
}
+ u32 bucket_offset;
+ u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ unsigned first_bucket = ca->mi.first_bucket;
+ u64 nbuckets = ca->mi.nbuckets;
+ unsigned bucket_size = ca->mi.bucket_size;
+ rcu_read_unlock();
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr2)
bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
- bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ bkey_fsck_err_on(bucket >= nbuckets, c, err,
ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
- bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
+ bkey_fsck_err_on(bucket < first_bucket, c, err,
ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ "pointer before first bucket (%llu < %u)", bucket, first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err,
ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, ca->mi.bucket_size);
+ bucket_offset, size_ondisk, bucket_size);
fsck_err:
return ret;
}
int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1181,7 +1214,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(crc_is_encoded(crc) &&
(crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err,
ptr_crc_uncompressed_size_too_big,
"too large encoded extent");
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index fd2669cdd76f..1ade959652b2 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -8,7 +8,7 @@
struct bch_fs;
struct btree_trans;
-enum bkey_invalid_flags;
+enum bch_validate_flags;
/* extent entries: */
@@ -406,12 +406,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
@@ -448,7 +448,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -678,7 +654,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
do { \
struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
\
- _ptr = &_ptrs.start->ptr; \
+ struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \
\
while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
if (_cond) { \
@@ -700,10 +676,11 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 000000000000..2eaffe37b5e7
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned as well if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, j, k;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = i; k = j * 2 + 1, k + 1 < n;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == n)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j != i;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = 0; k = j * 2 + 1, k + 1 < i;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == i)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap_func = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+
+#if 0
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/ktime.h>
+
+static u64 cmp_count;
+
+static int mycmp(const void *a, const void *b)
+{
+ u32 _a = *(u32 *)a;
+ u32 _b = *(u32 *)b;
+
+ cmp_count++;
+ if (_a < _b)
+ return -1;
+ else if (_a > _b)
+ return 1;
+ else
+ return 0;
+}
+
+static int test(void)
+{
+ size_t N, i;
+ ktime_t start, end;
+ s64 delta;
+ u32 *arr;
+
+ for (N = 10000; N <= 100000; N += 10000) {
+ arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
+ cmp_count = 0;
+
+ for (i = 0; i < N; i++)
+ arr[i] = get_random_u32();
+
+ start = ktime_get();
+ eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
+ end = ktime_get();
+
+ delta = ktime_us_delta(end, start);
+ printk(KERN_INFO "time: %lld\n", delta);
+ printk(KERN_INFO "comparisons: %lld\n", cmp_count);
+
+ u32 prev = 0;
+
+ eytzinger0_for_each(i, N) {
+ if (prev > arr[i])
+ goto err;
+ prev = arr[i];
+ }
+
+ kfree(arr);
+ }
+ return 0;
+
+err:
+ kfree(arr);
+ return -1;
+}
+#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf870..24840aee335c 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
 * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -244,21 +252,38 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first()) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ /*
+ * if eytzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; the next/prev identities mean this
+ * works as expected
+ *
+ * similarly, if find_le() returns the last element, we should return -1;
+ * the identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
@@ -269,13 +294,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size, _size))) \
+ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
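
For context on the header changes above — eytzinger0_find_le() now takes a plain two-argument cmp_func_t and eytzinger0_find_gt() is new — here is a hypothetical usage sketch in the style of the #if 0 test in eytzinger.c; u32_cmp(), lookup_demo(), the array size and the pr_info() output are assumptions for illustration only:

#include <linux/slab.h>
#include <linux/random.h>
#include <linux/printk.h>
#include "eytzinger.h"

static int u32_cmp(const void *a, const void *b)
{
	u32 _a = *(const u32 *)a;
	u32 _b = *(const u32 *)b;

	return _a < _b ? -1 : _a > _b ? 1 : 0;
}

static void lookup_demo(void)
{
	size_t nr = 256, i;
	u32 *arr = kmalloc_array(nr, sizeof(u32), GFP_KERNEL);

	if (!arr)
		return;

	for (i = 0; i < nr; i++)
		arr[i] = get_random_u32();

	/* Lay the array out in eytzinger (breadth-first) order: */
	eytzinger0_sort(arr, nr, sizeof(u32), u32_cmp, NULL);

	u32 key = get_random_u32();

	/* Index of the greatest element <= key, or -1 if none: */
	int le = eytzinger0_find_le(arr, nr, sizeof(u32), u32_cmp, &key);

	/* Index of the smallest element > key, or -1 if none: */
	int gt = eytzinger0_find_gt(arr, nr, sizeof(u32), u32_cmp, &key);

	if (le >= 0)
		pr_info("greatest element <= %u: %u\n", key, arr[le]);
	if (gt >= 0)
		pr_info("smallest element >  %u: %u\n", key, arr[gt]);

	kfree(arr);
}

One detail visible in the diff itself: eytzinger0_find_le()/_gt() invoke the comparator as cmp(element, search), whereas the eytzinger0_find() macro calls cmp(search, element); with a symmetric comparator such as the one above the order doesn't matter, but for asymmetric key types it does.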
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 624e6f963240..508d029ac53d 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -42,7 +42,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
@@ -70,7 +70,7 @@ int bch2_create_trans(struct btree_trans *trans,
struct bch_subvolume s;
ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
- BTREE_ITER_CACHED, &s);
+ BTREE_ITER_cached, &s);
if (ret)
goto err;
@@ -78,7 +78,7 @@ int bch2_create_trans(struct btree_trans *trans,
}
ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -163,7 +163,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
goto err;
@@ -171,7 +171,7 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
- inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ inode_iter.flags &= ~BTREE_ITER_all_snapshots;
bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
ret = bch2_btree_iter_traverse(&inode_iter) ?:
@@ -198,16 +198,16 @@ int bch2_link_trans(struct btree_trans *trans,
if (dir.subvol != inum.subvol)
return -EXDEV;
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
- goto err;
+ return ret;
inode_u->bi_ctime = now;
ret = bch2_inode_nlink_inc(inode_u);
if (ret)
- return ret;
+ goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
@@ -223,7 +223,7 @@ int bch2_link_trans(struct btree_trans *trans,
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum.inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
goto err;
@@ -255,19 +255,19 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ name, &inum, BTREE_ITER_intent);
if (ret)
goto err;
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -322,7 +322,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
err:
@@ -363,7 +363,7 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p
struct bkey_i_subvolume *s =
bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
@@ -394,7 +394,7 @@ int bch2_rename_trans(struct btree_trans *trans,
int ret;
ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -403,7 +403,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (dst_dir.inum != src_dir.inum ||
dst_dir.subvol != src_dir.subvol) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -423,13 +423,13 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
if (dst_inum.inum) {
ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
}
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 39292e7ef342..54873ecc635c 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- if (!bio->bi_status) {
- folio_mark_uptodate(fi.folio);
- } else {
- folio_clear_uptodate(fi.folio);
- folio_set_error(fi.folio);
- }
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
bio_put(bio);
}
@@ -176,7 +169,7 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@@ -264,7 +257,6 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts;
- struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
@@ -276,6 +268,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_pagecache_add_get(inode);
+ struct btree_trans *trans = bch2_trans_get(c);
while ((folio = readpage_iter_peek(&readpages_iter))) {
unsigned n = min_t(unsigned,
readpages_iter.folios.nr -
@@ -296,10 +289,10 @@ void bch2_readahead(struct readahead_control *ractl)
&readpages_iter);
bch2_trans_unlock(trans);
}
+ bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
- bch2_trans_put(trans);
darray_exit(&readpages_iter.folios);
}
@@ -408,7 +401,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- folio_set_error(fi.folio);
mapping_set_error(fi.folio->mapping, -EIO);
s = __bch2_folio(fi.folio);
@@ -445,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
*/
/*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
+ * The writeback flag is effectively our ref on the inode -
+ * fixup i_blocks before calling folio_end_writeback:
*/
bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
@@ -906,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_for_each(fs, fi) {
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
if (!f_copied) {
folios_trunc(&fs, fi);
break;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5ad..049b61bc9a5b 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -254,7 +254,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, err) {
+ BTREE_ITER_slots, k, err) {
if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
break;
@@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
+ bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -536,7 +538,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
if (likely(!dio->iter.count) || dio->op.error)
break;
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
}
out:
return bch2_dio_write_done(dio);
@@ -590,22 +592,27 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
inode_lock(&inode->v);
ret = generic_write_checks(req, iter);
if (unlikely(ret <= 0))
- goto err;
+ goto err_put_write_ref;
ret = file_remove_privs(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
ret = file_update_time(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
+ goto err_put_write_ref;
+ }
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
@@ -618,7 +625,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
@@ -645,7 +652,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
ret = bch2_dio_write_loop(dio);
-err:
+out:
if (locked)
inode_unlock(&inode->v);
return ret;
@@ -653,7 +660,9 @@ err_put_bio:
bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
- goto err;
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
}
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index d359aa9b33b8..872283e5bd1e 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -214,7 +214,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
unsigned state = bkey_to_sector_state(k);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a0c..ef20b64033e0 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_info *inode)
{
- struct bch_inode_unpacked u;
- int ret;
-
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
- if (ret)
- return ret;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
- bch2_inode_flush_nocow_writes(c, inode);
+ struct bch_inode_unpacked u;
+ int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ return ret;
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -202,7 +202,10 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
ret = bch2_flush_inode(c, inode);
out:
- return bch2_err_class(ret);
+ ret = bch2_err_class(ret);
+ if (ret == -EROFS)
+ ret = -EIO;
+ return ret;
}
/* truncate: */
@@ -594,7 +597,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
while (!ret && bkey_lt(iter.pos, end_pos)) {
s64 i_sectors_delta = 0;
@@ -1009,7 +1012,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE, 0, false);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 3dc8630ff9fe..205a323ffc6d 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -548,7 +548,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 0ccee05f6887..cd388f1702dc 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -90,7 +90,7 @@ retry:
bch2_trans_begin(trans);
ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT) ?:
+ BTREE_ITER_intent) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
@@ -212,21 +212,47 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
_ret; \
})
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+ BUG();
+}
+
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+{
+ struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ if (!inode)
+ return NULL;
+
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ mutex_init(&inode->ei_quota_lock);
+ inode->v.i_state = 0;
+
+ if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ kmem_cache_free(bch2_inode_cache, inode);
+ return NULL;
+ }
+
+ return inode;
+}
+
/*
* Allocate a new inode, dropping/retaking btree locks if necessary:
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
-
struct bch_inode_info *inode =
memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- to_bch_ei(new_inode(c->vfs_sb)));
+ __bch2_new_inode(trans->c));
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
- if (ret && inode)
- discard_new_inode(&inode->v);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
if (ret)
return ERR_PTR(ret);
}
@@ -287,7 +313,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = to_bch_ei(new_inode(c->vfs_sb));
+ inode = __bch2_new_inode(c);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
@@ -320,7 +346,7 @@ retry:
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
- BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ BTREE_ITER_with_updates, &subvol) ?:
bch2_trans_commit(trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
@@ -373,17 +399,14 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
+ struct printbuf buf = PRINTBUF;
- int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, name, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ int ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
if (ret > 0)
ret = -ENOENT;
@@ -403,20 +426,31 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
- if (bch2_err_matches(ret, ENOENT)) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "%s points to missing inode", buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ c, "dirent to missing inode:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
if (ret)
goto err;
+ /* regular files may have hardlinks: */
+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
+ !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
+ c,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, &inode_u),
+ buf.buf))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
return inode;
err:
inode = ERR_PTR(ret);
@@ -784,7 +818,7 @@ retry:
acl = NULL;
ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -841,6 +875,9 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
+ stat->subvol = inode->ei_subvol;
+ stat->result_mask |= STATX_SUBVOL;
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
@@ -961,7 +998,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
@@ -971,6 +1007,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (ret)
return ret;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
@@ -1037,6 +1074,10 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
}
start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
@@ -1103,6 +1144,8 @@ static int bch2_open(struct inode *vinode, struct file *file)
return ret;
}
+ file->f_mode |= FMODE_CAN_ODIRECT;
+
return generic_file_open(vinode, file);
}
@@ -1195,7 +1238,6 @@ static const struct address_space_operations bch_address_space_operations = {
.write_end = bch2_write_end,
.invalidate_folio = bch2_invalidate_folio,
.release_folio = bch2_release_folio,
- .direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
.migrate_folio = filemap_migrate_folio,
#endif
@@ -1484,34 +1526,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
mapping_set_large_folios(inode->v.i_mapping);
}
-static struct inode *bch2_alloc_inode(struct super_block *sb)
+static void bch2_free_inode(struct inode *vinode)
{
- struct bch_inode_info *inode;
-
- inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- mutex_init(&inode->ei_quota_lock);
-
- return &inode->v;
-}
-
-static void bch2_i_callback(struct rcu_head *head)
-{
- struct inode *vinode = container_of(head, struct inode, i_rcu);
- struct bch_inode_info *inode = to_bch_ei(vinode);
-
- kmem_cache_free(bch2_inode_cache, inode);
-}
-
-static void bch2_destroy_inode(struct inode *vinode)
-{
- call_rcu(&vinode->i_rcu, bch2_i_callback);
+ kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}
static int inode_update_times_fn(struct btree_trans *trans,
@@ -1819,7 +1836,7 @@ static int bch2_unfreeze(struct super_block *sb)
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
- .destroy_inode = bch2_destroy_inode,
+ .free_inode = bch2_free_inode,
.write_inode = bch2_vfs_write_inode,
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
@@ -1922,8 +1939,7 @@ got_sb:
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- ret = bch2_err_class(ret);
- return ERR_PTR(ret);
+ goto err;
}
c = sb->s_fs_info;
@@ -1997,7 +2013,17 @@ out:
return dget(sb->s_root);
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
+err:
+ /*
+ * On an inconsistency error in recovery we might see an -EROFS derived
+ * errorcode (from the journal), but we don't want to return that to
+ * userspace as that causes util-linux to retry the mount RO - which is
+ * confusing:
+ */
+ if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
+ ret = -EIO;
return ERR_PTR(bch2_err_class(ret));
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 47d4eefaba7b..fd277bd58ed3 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,7 +12,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -79,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
- POS(0, inode_nr),
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
}
-
- ret = bch2_inode_unpack(k, inode);
-err:
+ ret = -BCH_ERR_ENOENT_inode;
+found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -129,13 +123,13 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
+ struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
+ int ret = bkey_err(k);
if (ret)
return ret;
- d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
*target = le64_to_cpu(d.v->d_inum);
*type = d.v->d_type;
bch2_trans_iter_exit(trans, &iter);
@@ -156,11 +150,12 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -169,7 +164,8 @@ err:
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
@@ -184,19 +180,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
return ret;
@@ -257,9 +270,9 @@ create_lostfound:
&lostfound_str,
lostfound->bi_inum,
&lostfound->bi_dir_offset,
- BCH_HASH_SET_MUST_CREATE) ?:
+ STR_HASH_must_create) ?:
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch_err_msg(c, ret, "creating lost+found");
bch2_trans_iter_exit(trans, &lostfound_iter);
@@ -292,7 +305,7 @@ static int reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
}
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
@@ -316,7 +329,7 @@ static int reattach_inode(struct btree_trans *trans,
&name,
inode->bi_subvol ?: inode->bi_inum,
&dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
return ret;
@@ -363,14 +376,115 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
}
-struct snapshots_seen_entry {
- u32 id;
- u32 equiv;
-};
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+ new_inode.bi_size = size;
+ new_inode.bi_inum = inum;
+
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
struct snapshots_seen {
struct bpos pos;
- DARRAY(struct snapshots_seen_entry) ids;
+ snapshot_id_list ids;
};
static inline void snapshots_seen_exit(struct snapshots_seen *s)
@@ -385,20 +499,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
- struct snapshots_seen_entry *i, n = {
- .id = id,
- .equiv = bch2_snapshot_equiv(c, id),
- };
- int ret = 0;
-
+ u32 *i;
__darray_for_each(s->ids, i) {
- if (i->id == id)
+ if (*i == id)
return 0;
- if (i->id > id)
+ if (*i > id)
break;
}
- ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
if (ret)
bch_err(c, "error reallocating snapshots_seen table (size %zu)",
s->ids.size);
@@ -408,42 +517,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry n = {
- .id = pos.snapshot,
- .equiv = bch2_snapshot_equiv(c, pos.snapshot),
- };
- int ret = 0;
-
if (!bkey_eq(s->pos, pos))
s->ids.nr = 0;
-
s->pos = pos;
- s->pos.snapshot = n.equiv;
- darray_for_each(s->ids, i) {
- if (i->id == n.id)
- return 0;
-
- /*
- * We currently don't rigorously track for snapshot cleanup
- * needing to be run, so it shouldn't be a fsck error yet:
- */
- if (i->equiv == n.equiv) {
- bch_err(c, "snapshot deletion did not finish:\n"
- " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
- bch2_btree_id_str(btree_id),
- pos.inode, pos.offset,
- i->id, n.id, n.equiv);
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
- }
- }
-
- ret = darray_push(&s->ids, n);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
+ return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
/**
@@ -463,12 +541,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
ssize_t i;
EBUG_ON(id > ancestor);
- EBUG_ON(!bch2_snapshot_is_equiv(c, id));
- EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
/* @ancestor should be the snapshot most recently added to @seen */
EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+ EBUG_ON(ancestor != darray_last(seen->ids));
if (id == ancestor)
return true;
@@ -487,9 +563,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
*/
for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i].equiv >= id;
+ i >= 0 && seen->ids.data[i] >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
return false;
return true;
@@ -520,9 +596,6 @@ static int ref_visible2(struct bch_fs *c,
u32 src, struct snapshots_seen *src_seen,
u32 dst, struct snapshots_seen *dst_seen)
{
- src = bch2_snapshot_equiv(c, src);
- dst = bch2_snapshot_equiv(c, dst);
-
if (dst > src) {
swap(dst, src);
swap(dst_seen, src_seen);
@@ -569,7 +642,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
return darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
- .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ .snapshot = inode.k->p.snapshot,
}));
}
@@ -585,7 +658,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -605,21 +678,20 @@ static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
- u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
goto found;
return NULL;
found:
- BUG_ON(snapshot > i->snapshot);
+ BUG_ON(k.k->p.snapshot > i->snapshot);
- if (snapshot != i->snapshot && !is_whiteout) {
+ if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- new.snapshot = snapshot;
+ new.snapshot = k.k->p.snapshot;
new.count = 0;
struct printbuf buf = PRINTBUF;
@@ -628,10 +700,10 @@ found:
bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
- w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+ w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
printbuf_exit(&buf);
- while (i > w->inodes.data && i[-1].snapshot > snapshot)
+ while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
--i;
size_t pos = i - w->inodes.data;
@@ -663,10 +735,10 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
return lookup_inode_for_snapshot(trans->c, w, k);
}
-static int __get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
+static int get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -676,19 +748,17 @@ static int __get_visible_inodes(struct btree_trans *trans,
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
- if (!ref_visible(c, s, s->pos.snapshot, equiv))
+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
continue;
if (bkey_is_inode(k.k))
add_inode(c, w, k);
- if (equiv >= s->pos.snapshot)
+ if (k.k->p.snapshot >= s->pos.snapshot)
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -696,25 +766,6 @@ static int __get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- bkey_in_missing_snapshot,
- "key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
static int hash_redo_key(struct btree_trans *trans,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
@@ -738,8 +789,8 @@ static int hash_redo_key(struct btree_trans *trans,
bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
- BCH_HASH_SET_MUST_CREATE,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ STR_HASH_must_create|
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
@@ -768,7 +819,7 @@ static int hash_check_key(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter, desc.btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
@@ -909,7 +960,7 @@ static int check_inode(struct btree_trans *trans,
bool do_update = false;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
goto err;
if (ret)
@@ -1064,6 +1115,11 @@ static int check_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
if (fsck_err_on(ret,
c, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
@@ -1081,7 +1137,7 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
}
-
+do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
@@ -1105,7 +1161,7 @@ int bch2_check_inodes(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_inode(trans, &iter, k, &prev, &s, full)));
@@ -1130,8 +1186,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
}
@@ -1234,8 +1290,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOT_EXTENTS);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents);
k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
@@ -1297,7 +1353,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ BTREE_UPDATE_internal_snapshot_node,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
@@ -1338,7 +1394,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
struct snapshots_seen *seen,
struct extent_ends *extent_ends,
struct bkey_s_c k,
- u32 equiv,
struct btree_iter *iter,
bool *fixed)
{
@@ -1371,10 +1426,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1411,12 +1462,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv = k.k->p;
int ret = 0;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
@@ -1438,6 +1486,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
@@ -1454,8 +1513,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;
- ret = check_overlapping_extents(trans, s, extent_ends, k,
- equiv.snapshot, iter,
+ ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
&inode->recalculate_sums);
if (ret)
goto err;
@@ -1472,8 +1530,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (;
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > equiv.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
if (k.k->type != KEY_TYPE_whiteout) {
@@ -1490,7 +1548,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter2);
if (ret)
goto err;
@@ -1504,6 +1562,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
i->seen_this_pos = true;
}
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
err:
fsck_err:
@@ -1511,7 +1575,7 @@ fsck_err:
bch_err_fn(c, ret);
return ret;
delete:
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
}
@@ -1532,7 +1596,7 @@ int bch2_check_extents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
@@ -1557,7 +1621,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
@@ -1584,8 +1648,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1626,6 +1690,15 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (inode_points_to_dirent(target, d))
return 0;
+ if (bch2_inode_should_have_bp(target) &&
+ !fsck_err(c, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto out_noiter;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
@@ -1694,6 +1767,7 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
+out_noiter:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -1782,6 +1856,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
u64 parent_inum;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1790,6 +1865,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
@@ -1798,10 +1894,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- u32 new_parent_subvol;
- ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
- if (ret)
- goto err;
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
ret = PTR_ERR_OR_ZERO(new_dirent);
@@ -1847,9 +1943,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
+ goto err;
- if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = 0;
+ goto err;
+ }
+
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
c, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum,
@@ -1857,13 +1960,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
subvol_root.bi_parent_subvol = parent_subvol;
ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
out:
err:
fsck_err:
@@ -1880,21 +1983,16 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv;
int ret = 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
}
- equiv = k.k->p;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
goto err;
@@ -1919,12 +2017,23 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ dir->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
goto out;
}
@@ -1953,21 +2062,20 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
goto err;
} else {
- ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
goto err;
if (fsck_err_on(!target->inodes.nr,
c, dirent_to_missing_inode,
- "dirent points to missing inode: (equiv %u)\n%s",
- equiv.snapshot,
+ "dirent points to missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
@@ -1984,7 +2092,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
}
if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
}
out:
@@ -2011,7 +2119,7 @@ int bch2_check_dirents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -2034,7 +2142,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker_entry *i;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
@@ -2075,7 +2183,7 @@ int bch2_check_xattrs(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -2098,17 +2206,21 @@ static int check_root_trans(struct btree_trans *trans)
if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
"root subvol missing")) {
- struct bkey_i_subvolume root_subvol;
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
+ if (ret)
+ goto err;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
- bkey_subvolume_init(&root_subvol.k_i);
- root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol.v.flags = 0;
- root_subvol.v.snapshot = cpu_to_le32(snapshot);
- root_subvol.v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
@@ -2238,7 +2350,7 @@ int bch2_check_subvolume_structure(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol_path(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -2273,7 +2385,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode;
struct printbuf buf = PRINTBUF;
- u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
+ u32 snapshot = inode_k.k->p.snapshot;
int ret = 0;
p->nr = 0;
@@ -2375,9 +2487,9 @@ int bch2_check_directory_structure(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (!bkey_is_inode(k.k))
continue;
@@ -2477,9 +2589,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_inodes,
POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
if (!bkey_is_inode(k.k))
continue;
@@ -2520,9 +2632,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
@@ -2533,8 +2645,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
if (d.v->d_type != DT_DIR &&
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
+ le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
}
0;
})));
@@ -2597,7 +2708,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
@@ -2665,7 +2776,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
u->v.front_pad = 0;
u->v.back_pad = 0;
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
}
int bch2_fix_reflink_p(struct bch_fs *c)
@@ -2676,8 +2787,8 @@ int bch2_fix_reflink_p(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
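
Editor's note: the check_root_trans() hunk above moves the root subvolume key off the stack into memory obtained with bch2_trans_kmalloc(), converting the allocation result with PTR_ERR_OR_ZERO() before it is touched; the likely motivation is that the key has to stay valid for the transaction, not just for the current stack frame. A minimal standalone sketch of that pattern, using simplified stand-ins rather than the real bcachefs types:

/* Sketch only: stand-ins for btree_trans, bch2_trans_kmalloc() and PTR_ERR_OR_ZERO(). */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct trans;                             /* opaque stand-in for struct btree_trans */

static void *trans_kmalloc(struct trans *trans, size_t size)
{
	(void) trans;
	return malloc(size);              /* the real helper allocates from the transaction arena */
}

static int ptr_err_or_zero(const void *p)
{
	return p ? 0 : -ENOMEM;           /* simplified PTR_ERR_OR_ZERO() */
}

struct subvol_key {                       /* stand-in for struct bkey_i_subvolume */
	uint32_t	snapshot;
	uint64_t	inode;
};

static int write_root_subvol(struct trans *trans)
{
	struct subvol_key *k = trans_kmalloc(trans, sizeof(*k));
	int ret = ptr_err_or_zero(k);
	if (ret)
		return ret;               /* check the allocation before initializing it */

	k->snapshot	= UINT32_MAX;
	k->inode	= 4096;           /* placeholder, not the real BCACHEFS_ROOT_INO */
	/* ... hand k to the transaction's update list; it must stay valid until commit ...
	 * (leaked in this sketch; the real arena is freed with the transaction) */
	return 0;
}

int main(void)
{
	printf("ret = %d\n", write_root_subvol(NULL));
	return 0;
}
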
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 2b5e06770ab3..aafa79fa6351 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -339,7 +339,7 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_CACHED);
+ flags|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -371,7 +371,7 @@ int bch2_inode_peek(struct btree_trans *trans,
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_inode_buf *inode_p;
@@ -399,7 +399,7 @@ int __bch2_fsck_write_inode(struct btree_trans *trans,
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
int bch2_fsck_write_inode(struct btree_trans *trans,
@@ -473,7 +473,7 @@ fsck_err:
}
int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -490,7 +490,7 @@ fsck_err:
}
int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
@@ -507,7 +507,7 @@ fsck_err:
}
int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
@@ -535,29 +535,19 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o", inode->bi_mode);
- prt_newline(out);
+ prt_printf(out, "mode=%o\n", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, " (%x)", inode->bi_flags);
- prt_newline(out);
+ prt_printf(out, " (%x)\n", inode->bi_flags);
- prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
- prt_newline(out);
-
- prt_printf(out, "bi_size=%llu", inode->bi_size);
- prt_newline(out);
-
- prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
- prt_newline(out);
-
- prt_newline(out);
- prt_printf(out, "bi_version=%llu", inode->bi_version);
+ prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "bi_size=%llu\n", inode->bi_size);
+ prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
+ prt_printf(out, "bi_version=%llu\n", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu", (u64) inode->_name); \
- prt_newline(out);
+ prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
printbuf_indent_sub(out, 2);
@@ -604,11 +594,11 @@ int bch2_trigger_inode(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
if (nr) {
int ret = bch2_replicas_deltas_realloc(trans, 0);
if (ret)
@@ -627,13 +617,13 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
}
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
percpu_down_read(&c->mark_lock);
@@ -645,7 +635,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -762,8 +752,8 @@ int bch2_inode_create(struct btree_trans *trans,
pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -824,7 +814,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
* extent iterator:
*/
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while (1) {
bch2_trans_begin(trans);
@@ -846,7 +836,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -895,7 +885,7 @@ retry:
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
- BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ BTREE_ITER_intent|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1055,7 +1045,7 @@ retry:
bch2_trans_begin(trans);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ SPOS(0, inum, snapshot), BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1100,7 +1090,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
struct bch_inode_unpacked inode;
int ret;
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1152,7 +1142,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret)
goto out;
@@ -1199,7 +1189,7 @@ again:
* flushed and we'd spin:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
if (ret > 0) {
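
Editor's note: several prototypes in this patch (bch2_inode_write_flags(), bch2_trigger_inode()) switch their flags argument from a bare unsigned to the named enum btree_iter_update_trigger_flags. A small sketch of why a named enum parameter helps, assuming a compiler that supports -Wenum-conversion (recent GCC or clang); with a plain unsigned, a constant from the wrong flag namespace is accepted silently:

/* Sketch only: two unrelated flag namespaces as distinct enum types. */
enum update_flags {
	UPDATE_nojournal		= 1 << 0,
	UPDATE_internal_snapshot_node	= 1 << 1,
};

enum trigger_flags {
	TRIGGER_transactional		= 1 << 0,
	TRIGGER_gc			= 1 << 1,
};

static int do_update(enum update_flags flags)
{
	return !!(flags & UPDATE_nojournal);
}

int main(void)
{
	int a = do_update(UPDATE_nojournal);	/* fine */
	int b = do_update(TRIGGER_gc);		/* wrong namespace: -Wenum-conversion flags this */
	return a + b;
}
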
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 056298050550..679f5f5e5d15 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -6,19 +6,20 @@
#include "bkey_methods.h"
#include "opts.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
@@ -49,7 +50,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
}
int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
@@ -101,7 +102,7 @@ int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_update_flags);
+ struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
static inline int bch2_inode_write(struct btree_trans *trans,
struct btree_iter *iter,
@@ -220,6 +221,14 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode)
+{
+ bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
+
+ return S_ISDIR(inode->bi_mode) ||
+ (!inode->bi_nlink && inode_has_bp);
+}
+
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
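
Editor's note: the new bch2_inode_should_have_bp() helper in inode.h encodes when fsck expects an inode to carry a backpointer (bi_dir/bi_dir_offset): always for directories, and for inodes with no link count that already have one recorded. A standalone restatement of the same predicate with stand-in types, plus two illustrative cases:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define S_IFDIR_	0040000			/* stand-in for S_IFDIR */
#define S_ISDIR_(m)	(((m) & 0170000) == S_IFDIR_)

struct inode_unpacked {				/* stand-in for struct bch_inode_unpacked */
	uint16_t	bi_mode;
	uint32_t	bi_nlink;
	uint64_t	bi_dir;
	uint64_t	bi_dir_offset;
};

static bool inode_should_have_bp(const struct inode_unpacked *inode)
{
	bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;

	return S_ISDIR_(inode->bi_mode) ||
		(!inode->bi_nlink && inode_has_bp);
}

int main(void)
{
	struct inode_unpacked dir	= { .bi_mode  = S_IFDIR_ };
	struct inode_unpacked linked	= { .bi_nlink = 1, .bi_dir = 42 };

	assert(inode_should_have_bp(&dir));	/* directories: always expected */
	assert(!inode_should_have_bp(&linked));	/* non-dir with nlink set: not required */
	return 0;
}
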
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594cca..4ec979b4b23e 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -198,7 +198,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, start),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
@@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans,
struct bch_inode_unpacked inode_u;
int ret;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?:
(inode_u.bi_size = new_i_size, 0) ?:
bch2_inode_write(trans, &iter, &inode_u);
@@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
bch2_trans_iter_exit(trans, &fpunch_iter);
@@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
ret = 0;
err:
bch2_logged_op_finish(trans, op_k);
+ bch_err_fn(c, ret);
return ret;
}
@@ -316,7 +317,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset
offset <<= 9;
len <<= 9;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
if (ret)
return ret;
@@ -364,7 +365,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
switch (op->v.state) {
case LOGGED_OP_FINSERT_start:
@@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
+ bch_err_fn(c, ret);
bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 8a556e6d1ab6..f57486794484 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -378,7 +378,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ rbio->read_pos, BTREE_ITER_slots);
retry:
rbio->bio.bi_status = 0;
@@ -487,7 +487,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
if ((ret = bkey_err(k)))
goto out;
@@ -523,7 +523,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
goto out;
ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -541,7 +541,6 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -647,13 +646,15 @@ csum_err:
prt_str(&buf, "data ");
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data %s", buf.buf);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "data %s", buf.buf);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
printbuf_exit(&buf);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
@@ -675,7 +676,7 @@ static void bch2_read_endio(struct bio *bio)
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
@@ -687,17 +688,21 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ if (bio->bi_status) {
+ if (ca) {
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset,
+ "data read error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ }
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr)) {
+ (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -758,22 +763,21 @@ err:
}
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c k,
struct bch_extent_ptr ptr)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
struct btree_iter iter;
struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(c, &ptr),
- BTREE_ITER_CACHED);
+ PTR_BUCKET_POS(ca, &ptr),
+ BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
printbuf_indent_add(&buf, 2);
- prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
@@ -801,7 +805,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -832,7 +835,7 @@ retry_pick:
goto err;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -842,9 +845,11 @@ retry_pick:
*/
if ((flags & BCH_READ_IN_RETRY) &&
!pick.ptr.cached &&
- unlikely(ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ ca &&
+ unlikely(dev_ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
+ percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -859,8 +864,11 @@ retry_pick:
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
goto hole;
+ }
iter.bi_size = pick.crc.compressed_size << 9;
goto get_bio;
@@ -965,7 +973,7 @@ get_bio:
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
@@ -995,7 +1003,7 @@ get_bio:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -1113,7 +1121,7 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
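
Editor's note: a recurring change in io_read.c is that bch_dev_bkey_exists() lookups are replaced by bch2_dev_get_ioref(), which may return NULL, and every early exit after a successful get has to drop the reference (see the percpu_ref_put() calls added on the stale-pointer retry and hole paths). A minimal self-contained sketch of that acquire/check/release discipline, using a plain counter in place of a percpu ref:

#include <stdbool.h>
#include <stdio.h>

struct dev {					/* stand-in for struct bch_dev */
	int	io_ref;				/* stand-in for the percpu io_ref */
	bool	online;
};

/* Returns the device with a reference held, or NULL if it can't do IO. */
static struct dev *dev_get_ioref(struct dev *ca)
{
	if (!ca->online)
		return NULL;
	ca->io_ref++;
	return ca;
}

static void dev_put_ioref(struct dev *ca)
{
	ca->io_ref--;
}

static int read_one_extent(struct dev *device, bool stale)
{
	struct dev *ca = dev_get_ioref(device);

	if (!ca)
		return -1;			/* no ref taken, nothing to drop */

	if (stale) {
		dev_put_ioref(ca);		/* early exit: drop the ref we took */
		return -2;
	}

	/* ... issue the read; normally the completion path drops the ref ... */
	dev_put_ioref(ca);			/* completion stand-in */
	return 0;
}

int main(void)
{
	struct dev d = { .online = true };

	printf("%d %d (ref %d)\n", read_one_extent(&d, false),
	       read_one_extent(&d, true), d.io_ref);
	return 0;
}
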
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f137252bccc5..9401d13e31bb 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
for_each_btree_key_upto_continue_norestart(iter,
- new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -199,9 +199,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
u64 new_i_size,
s64 i_sectors_delta)
{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
@@ -213,26 +210,37 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
- unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
+ unsigned inode_update_flags = BTREE_UPDATE_nojournal;
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
- BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ret;
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
if (unlikely(ret))
goto err;
}
- inode = bkey_i_to_inode_v3(k);
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -251,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
inode_update_flags);
err:
bch2_trans_iter_exit(trans, &iter);
@@ -360,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
bkey_start_pos(&sk.k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
@@ -399,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(!bch2_dev_exists2(c, ptr->dev));
-
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = nocow
+ ? bch2_dev_have_ref(c, ptr->dev)
+ : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOFS, &ca->replica_set));
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
@@ -422,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = nocow || bch2_dev_get_ioref(ca,
- type == BCH_DATA_btree ? READ : WRITE);
+ n->have_ioref = ca != NULL;
n->nocow = nocow;
n->submit_time = local_clock();
n->inode_offset = bkey_start_offset(&k->k);
+ if (nocow)
+ n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
n->bio.bi_iter.bi_sector = ptr->offset;
if (likely(n->have_ioref)) {
@@ -473,7 +481,6 @@ static void bch2_write_done(struct closure *cl)
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
- struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
@@ -642,7 +649,9 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref
+ ? bch2_dev_have_ref(c, wbio->dev)
+ : NULL;
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
@@ -653,8 +662,12 @@ static void bch2_write_endio(struct bio *bio)
op->flags |= BCH_WRITE_IO_ERROR;
}
- if (wbio->nocow)
+ if (wbio->nocow) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ POS(ca->dev_idx, wbio->nocow_bucket),
+ BUCKET_NOCOW_LOCK_UPDATE);
set_bit(wbio->dev, op->devs_need_flush->d);
+ }
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -1093,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
return false;
e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec)
+ if (crc_is_encoded(p.crc) || p.has_ec) {
+ rcu_read_unlock();
return false;
+ }
replicas += bch2_extent_ptr_durability(c, &p);
}
+ rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
-
- for_each_keylist_key(&op->insert_keys, k) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- }
-}
-
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *orig,
@@ -1150,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0) ?:
bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
@@ -1161,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
for_each_keylist_key(&op->insert_keys, orig) {
int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1187,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- bch2_nocow_write_unlock(op);
-
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
op->error = -EIO;
} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
@@ -1234,12 +1236,16 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bio *bio = &op->wbio.bio;
buckets.nr = 0;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -1259,14 +1265,15 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (unlikely(!ca))
+ goto err_get_ioref;
+
+ struct bpos b = PTR_BUCKET_POS(ca, ptr);
struct nocow_lock_bucket *l =
bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
prefetch(l);
- if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
- goto err_get_ioref;
-
/* XXX allocating memory with btree locks held - rare */
darray_push_gfp(&buckets, ((struct bucket_to_lock) {
.b = b, .gen = ptr->gen, .l = l,
@@ -1285,7 +1292,7 @@ retry:
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+ struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
bucket_to_u64(i->b),
@@ -1362,7 +1369,7 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
/* Fall back to COW path: */
goto out;
@@ -1483,7 +1490,11 @@ err:
if ((op->flags & BCH_WRITE_SYNC) ||
(!(op->flags & BCH_WRITE_DONE) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
- closure_sync(&op->cl);
+ if (closure_sync_timeout(&op->cl, HZ * 10)) {
+ bch2_print_allocator_stuck(c);
+ closure_sync(&op->cl);
+ }
+
__bch2_write_index(op);
if (!(op->flags & BCH_WRITE_DONE))
@@ -1505,6 +1516,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
+ memset(&op->failed, 0, sizeof(op->failed));
+
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
@@ -1639,8 +1652,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
- prt_newline(out);
+ prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}
@@ -1648,13 +1660,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
void bch2_fs_io_write_exit(struct bch_fs *c)
{
mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
int bch2_fs_io_write_init(struct bch_fs *c)
{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS))
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+ bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
return -BCH_ERR_ENOMEM_bio_write_init;
if (mempool_init_page_pool(&c->bio_bounce_pages,
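
Editor's note: bch2_extent_is_writeable() above now walks the extent pointers under rcu_read_lock(), which adds a subtle obligation: the early "return false" inside the loop must drop the read lock first, which is why the hunk pairs the new return with an rcu_read_unlock(). A small sketch of the same control-flow shape, with a stand-in lock so it stays self-contained:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins for rcu_read_lock()/rcu_read_unlock(). */
static int read_lock_depth;
static void read_lock(void)   { read_lock_depth++; }
static void read_unlock(void) { read_lock_depth--; }

struct ptr { bool encoded; unsigned durability; };

static bool extent_is_writeable(const struct ptr *ptrs, size_t nr, unsigned want_replicas)
{
	unsigned replicas = 0;

	read_lock();
	for (size_t i = 0; i < nr; i++) {
		if (ptrs[i].encoded) {
			read_unlock();		/* every exit from the locked section unlocks */
			return false;
		}
		replicas += ptrs[i].durability;
	}
	read_unlock();

	return replicas >= want_replicas;
}

int main(void)
{
	struct ptr p[] = { { .durability = 1 }, { .durability = 1 } };

	printf("%d (lock depth %d)\n", extent_is_writeable(p, 2, 2), read_lock_depth);
	return 0;
}
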
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index c7f97c2c4805..6e878a6f2f0b 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -20,6 +20,7 @@ struct bch_write_bio {
u64 submit_time;
u64 inode_offset;
+ u64 nocow_bucket;
struct bch_devs_list failed;
u8 dev;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 9c9a25dbd613..adec8e1ea73e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -53,29 +53,19 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_str(out, "seq:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
+ prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
- prt_str(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
- prt_str(out, "size:");
- prt_tab(out);
+ prt_printf(out, "size:\t");
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_str(out, "expires:");
- prt_tab(out);
- prt_printf(out, "%li jiffies", buf->expires - jiffies);
- prt_newline(out);
+ prt_printf(out, "expires:\t");
+ prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
- prt_str(out, "flags:");
- prt_tab(out);
+ prt_printf(out, "flags:\t");
if (buf->noflush)
prt_str(out, "noflush ");
if (buf->must_flush)
@@ -87,9 +77,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
if (buf->write_started)
prt_str(out, "write_started ");
if (buf->write_allocated)
- prt_str(out, "write allocated ");
+ prt_str(out, "write_allocated ");
if (buf->write_done)
- prt_str(out, "write done");
+ prt_str(out, "write_done");
prt_newline(out);
printbuf_indent_sub(out, 2);
@@ -706,6 +696,12 @@ recheck_need_open:
spin_unlock(&j->lock);
+ /*

+ * We're called from bch2_journal_flush_seq() -> wait_event(),
+ * but this might block. We won't usually block, so we won't
+ * livelock:
+ */
+ sched_annotate_sleep();
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
@@ -870,6 +866,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
{
struct journal_buf *ret = NULL;
+ /* We're inside wait_event(), but using mutex_lock(): */
+ sched_annotate_sleep();
mutex_lock(&j->buf_lock);
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
@@ -940,7 +938,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
break;
}
} else {
- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
+ BCH_DATA_journal, cl);
ret = PTR_ERR_OR_ZERO(ob[nr_got]);
if (ret)
break;
@@ -948,7 +947,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ret = bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size));
+ ca->mi.bucket_size, BTREE_TRIGGER_transactional));
if (ret) {
bch2_open_bucket_put(c, ob[nr_got]);
bch_err_msg(c, ret, "marking new journal buckets");
@@ -1028,7 +1027,8 @@ err_unblock:
for (i = 0; i < nr_got; i++)
bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0));
+ bu[i], BCH_DATA_free, 0,
+ BTREE_TRIGGER_transactional));
err_free:
if (!new_fs)
for (i = 0; i < nr_got; i++)
@@ -1179,12 +1179,14 @@ void bch2_fs_journal_stop(struct journal *j)
bch2_journal_meta(j);
journal_quiesce(j);
+ cancel_delayed_work_sync(&j->write_work);
BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
j->last_empty_seq != journal_cur_seq(j));
- cancel_delayed_work_sync(&j->write_work);
+ if (!bch2_journal_error(j))
+ clear_bit(JOURNAL_running, &j->flags);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
@@ -1258,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
spin_lock(&j->lock);
- set_bit(JOURNAL_STARTED, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
@@ -1399,6 +1401,13 @@ int bch2_fs_journal_init(struct journal *j)
/* debug: */
+static const char * const bch2_journal_flags_strs[] = {
+#define x(n) #n,
+ JOURNAL_FLAGS()
+#undef x
+ NULL
+};
+
void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -1407,19 +1416,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 28);
out->atomic++;
rcu_read_lock();
s = READ_ONCE(j->reservations);
+ prt_printf(out, "flags:\t");
+ prt_bitflags(out, bch2_journal_flags_strs, j->flags);
+ prt_newline(out);
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
@@ -1428,48 +1440,44 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_newline(out);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "blocked:\t\t%u\n", j->blocked);
+ prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
- prt_printf(out, "current entry:\t\t");
+ prt_printf(out, "current entry:\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error");
+ prt_printf(out, "error\n");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed");
+ prt_printf(out, "closed\n");
break;
default:
- prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+ prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- prt_newline(out);
- prt_printf(out, "unwritten entries:");
- prt_newline(out);
+ prt_printf(out, "unwritten entries:\n");
bch2_journal_bufs_to_text(out, j);
- prt_printf(out,
- "replay done:\t\t%i\n",
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
prt_printf(out, "space:\n");
- prt_printf(out, "\tdiscarded\t%u:%u\n",
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "discarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
- prt_printf(out, "\tclean ondisk\t%u:%u\n",
+ prt_printf(out, "clean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "\tclean\t\t%u:%u\n",
+ prt_printf(out, "clean\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
- prt_printf(out, "\ttotal\t\t%u:%u\n",
+ prt_printf(out, "total\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
+ printbuf_indent_sub(out, 2);
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -1480,14 +1488,16 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "\tnr\t\t%u\n", ja->nr);
- prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
- prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "nr\t%u\n", ja->nr);
+ prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ printbuf_indent_sub(out, 2);
}
rcu_read_unlock();
@@ -1519,25 +1529,18 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
pin_list = journal_seq_pin(j, *seq);
- prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
- prt_newline(out);
+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
- list_for_each_entry(pin, &pin_list->list[i], list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
+ list_for_each_entry(pin, &pin_list->list[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
- if (!list_empty(&pin_list->flushed)) {
- prt_printf(out, "flushed:");
- prt_newline(out);
- }
+ if (!list_empty(&pin_list->flushed))
+ prt_printf(out, "flushed:\n");
- list_for_each_entry(pin, &pin_list->flushed, list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
+ list_for_each_entry(pin, &pin_list->flushed, list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
printbuf_indent_sub(out, 2);
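
Editor's note: the journal debug output gains a flags line backed by bch2_journal_flags_strs[], generated by expanding the JOURNAL_FLAGS() x-macro with "#define x(n) #n,". A standalone sketch of the technique with a simplified flag list; because the enum and the string table both come from the same macro, they cannot drift apart:

#include <stdio.h>

/* One definition of the flag list ... */
#define FLAGS()			\
	x(replay_done)		\
	x(running)		\
	x(may_skip_flush)	\
	x(need_flush_write)

/* ... expanded once into an enum of bit numbers ... */
enum flag_bits {
#define x(n)	FLAG_##n,
	FLAGS()
#undef x
};

/* ... and once into the matching name table. */
static const char * const flag_strs[] = {
#define x(n)	#n,
	FLAGS()
#undef x
	NULL
};

static void print_bitflags(unsigned long v)
{
	for (unsigned i = 0; flag_strs[i]; i++)
		if (v & (1UL << i))
			printf("%s ", flag_strs[i]);
	printf("\n");
}

int main(void)
{
	print_bitflags((1UL << FLAG_running) | (1UL << FLAG_may_skip_flush));
	return 0;
}
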
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 7c7528f839c5..fd1f7cdaa8bc 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -372,7 +372,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
int ret;
EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
res->u64s = u64s;
@@ -418,8 +418,8 @@ struct bch_dev;
static inline void bch2_journal_set_replay_done(struct journal *j)
{
- BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
- set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ BUG_ON(!test_bit(JOURNAL_running, &j->flags));
+ set_bit(JOURNAL_replay_done, &j->flags);
}
void bch2_journal_unblock(struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 725fcf46f631..cdcb1ad49af4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,15 +17,38 @@
#include "sb-clean.h"
#include "trace.h"
+void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ for_each_member_device(c, ca) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
+ m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
+ }
+}
+
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+
+ unsigned idx = le32_to_cpu(m.last_journal_bucket);
+ if (idx < ca->journal.nr)
+ ca->journal.cur_idx = idx;
+ unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
+ if (offset <= ca->mi.bucket_size)
+ ca->journal.sectors_free = ca->mi.bucket_size - offset;
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
- u64 offset;
-
- div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
-
if (i != j->ptrs.data)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
@@ -122,6 +145,10 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
+ if (!c->journal.oldest_seq_found_ondisk ||
+ le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
+ c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
+
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
le64_to_cpu(j->seq) < jlist->last_seq)
@@ -247,7 +274,7 @@ static void journal_entry_err_msg(struct printbuf *out,
if (entry) {
prt_str(out, " type=");
- prt_str(out, bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
}
if (!jset) {
@@ -272,7 +299,7 @@ static void journal_entry_err_msg(struct printbuf *out,
journal_entry_err_msg(&_buf, version, jset, entry); \
prt_printf(&_buf, msg, ##__VA_ARGS__); \
\
- switch (flags & BKEY_INVALID_WRITE) { \
+ switch (flags & BCH_VALIDATE_write) { \
case READ: \
mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
@@ -301,9 +328,9 @@ static int journal_validate_key(struct bch_fs *c,
unsigned level, enum btree_id btree_id,
struct bkey_i *k,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
- int write = flags & BKEY_INVALID_WRITE;
+ int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -376,7 +403,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct bkey_i *k = entry->start;
@@ -385,7 +412,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
entry->level,
entry->btree_id,
k, version, big_endian,
- flags|BKEY_INVALID_JOURNAL);
+ flags|BCH_VALIDATE_journal);
if (ret == FSCK_DELETED_KEY)
continue;
@@ -403,7 +430,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
}
prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -415,7 +443,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct bkey_i *k = entry->start;
int ret = 0;
@@ -454,7 +482,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
/* obsolete, don't care: */
return 0;
@@ -469,7 +497,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
int ret = 0;
@@ -496,7 +524,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
@@ -538,7 +566,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@@ -563,16 +591,16 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- prt_printf(out, "type=%s v=%llu",
- bch2_fs_usage_types[u->entry.btree_id],
- le64_to_cpu(u->v));
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
@@ -616,7 +644,7 @@ static int journal_entry_clock_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
@@ -656,13 +684,12 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u);
- unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
@@ -674,16 +701,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
return ret;
}
- dev = le32_to_cpu(u->dev);
-
- if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, version, jset, entry,
- journal_entry_dev_usage_bad_dev,
- "bad dev")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
if (journal_entry_err_on(u->pad,
c, version, jset, entry,
journal_entry_dev_usage_bad_pad,
@@ -718,7 +735,7 @@ static int journal_entry_log_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return 0;
}
@@ -736,7 +753,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
@@ -752,7 +769,7 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
@@ -768,7 +785,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned bytes = vstruct_bytes(entry);
unsigned expected = 16;
@@ -798,7 +815,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ enum bch_validate_flags);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
@@ -816,7 +833,7 @@ int bch2_journal_entry_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return entry->type < BCH_JSET_ENTRY_NR
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
@@ -827,16 +844,16 @@ int bch2_journal_entry_validate(struct bch_fs *c,
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
+ bch2_prt_jset_entry_type(out, entry->type);
+
if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_str(out, ": ");
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- } else {
- prt_printf(out, "(unknown type %u)", entry->type);
}
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
@@ -862,7 +879,7 @@ fsck_err:
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned version;
int ret = 0;
@@ -917,7 +934,7 @@ static int jset_validate_early(struct bch_fs *c,
{
size_t bytes = vstruct_bytes(jset);
unsigned version;
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
@@ -1056,6 +1073,13 @@ reread:
goto err;
}
+ if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
+ ja->highest_seq_found = le64_to_cpu(j->seq);
+ ja->cur_idx = bucket;
+ ja->sectors_free = ca->mi.bucket_size -
+ bucket_remainder(ca, offset) - sectors;
+ }
+
/*
* This happens sometimes if we don't have discards on -
* when we've partially overwritten a bucket with new
@@ -1124,8 +1148,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct journal_replay *r, **_r;
- struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
unsigned i;
int ret = 0;
@@ -1145,47 +1167,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
goto err;
}
- ja->sectors_free = ca->mi.bucket_size;
-
- mutex_lock(&jlist->lock);
- genradix_for_each_reverse(&c->journal_entries, iter, _r) {
- r = *_r;
-
- if (!r)
- continue;
-
- darray_for_each(r->ptrs, i)
- if (i->dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, i->sector) +
- vstruct_sectors(&r->j, c->block_bits);
-
- ja->cur_idx = i->bucket;
- ja->sectors_free = ca->mi.bucket_size - wrote;
- goto found;
- }
- }
-found:
- mutex_unlock(&jlist->lock);
-
- if (ja->bucket_seq[ja->cur_idx] &&
- ja->sectors_free == ca->mi.bucket_size) {
-#if 0
- /*
- * Debug code for ZNS support, where we (probably) want to be
- * correlated where we stopped in the journal to the zone write
- * points:
- */
- bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
- bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
- for (i = 0; i < 3; i++) {
- unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
-
- bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
- }
-#endif
- ja->sectors_free = 0;
- }
-
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
@@ -1254,7 +1235,7 @@ int bch2_journal_read(struct bch_fs *c,
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
i = *_i;
@@ -1365,7 +1346,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err(c, journal_entries_missing,
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
- " next at %s",
+ " next at %s, continue?",
missing_start, missing_end,
*last_seq, *blacklist_seq - 1,
buf1.buf, buf2.buf);
@@ -1389,7 +1370,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
if (!ptr->csum_good)
bch_err_dev_offset(ca, ptr->sector,
@@ -1399,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
+ bch2_dev_have_ref(c, i->ptrs.data[0].dev),
&i->j,
i->ptrs.data[0].sector,
READ);
@@ -1722,7 +1703,7 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
-static CLOSURE_CALLBACK(do_journal_write)
+static CLOSURE_CALLBACK(journal_write_submit)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
@@ -1730,10 +1711,8 @@ static CLOSURE_CALLBACK(do_journal_write)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct journal_device *ja = &ca->journal;
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
@@ -1742,6 +1721,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
+ struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
@@ -1767,6 +1747,44 @@ static CLOSURE_CALLBACK(do_journal_write)
continue_at(cl, journal_write_done, j->wq);
}
+static CLOSURE_CALLBACK(journal_write_preflush)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ spin_lock(&j->lock);
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
+
+ if (w->separate_flush) {
+ for_each_rw_member(c, ca) {
+ percpu_ref_get(&ca->io_ref);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, journal_write_submit, j->wq);
+ } else {
+ /*
+ * no need to punt to another work item if we're not waiting on
+ * preflushes
+ */
+ journal_write_submit(&cl->work);
+ }
+}
+
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -1919,14 +1937,14 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
- if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
return -EIO;
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
@@ -1937,7 +1955,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ clear_bit(JOURNAL_need_flush_write, &j->flags);
}
return 0;
@@ -2032,23 +2050,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data))
- closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
-
- if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
-
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
-
- continue_at(cl, do_journal_write, j->wq);
+ continue_at(cl, journal_write_preflush, j->wq);
+ else
+ continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);
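
The hunks above split the old do_journal_write() into journal_write_preflush() and journal_write_submit(), so bch2_journal_write() no longer blocks inline waiting for the previous entry. The standalone sketch below (user-space C with stub functions; none of these names come from the patch) restates the ordering rule those callbacks enforce: a flush write waits until the previous sequence is on disk, optionally issues per-device preflushes, then submits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stubs standing in for the real closure/bio machinery. */
static void issue_preflush(int dev)        { printf("preflush dev %d\n", dev); }
static void submit_journal_write(void)     { printf("submit journal write\n"); }
static void requeue_until_prev_done(void)  { printf("wait: previous write not yet on disk\n"); }

/*
 * Mirrors the ordering in journal_write_preflush(): a flush write may only
 * go out once the previous journal entry is known to be on disk, and only
 * needs separate preflush bios when separate_flush is set.
 */
static void write_journal_entry(uint64_t seq, uint64_t seq_ondisk,
				bool flush, bool separate_flush, int nr_devs)
{
	if (flush && seq_ondisk + 1 != seq) {
		requeue_until_prev_done();	/* continue_at(journal_write_preflush) */
		return;
	}

	if (flush && separate_flush)
		for (int dev = 0; dev < nr_devs; dev++)
			issue_preflush(dev);	/* REQ_OP_WRITE|REQ_PREFLUSH per device */

	submit_journal_write();			/* journal_write_submit() */
}

int main(void)
{
	write_journal_entry(10, 8, true, true, 2);	/* must wait: seq 9 not yet on disk */
	write_journal_entry(10, 9, true, true, 2);	/* preflush both devices, then submit */
	write_journal_entry(10, 9, false, false, 2);	/* noflush write: submit directly */
	return 0;
}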
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 4f1e763ab506..2ca9cde30ea8 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -4,6 +4,9 @@
#include "darray.h"
+void bch2_journal_pos_from_member_info_set(struct bch_fs *);
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
+
struct journal_ptr {
bool csum_good;
u8 dev;
@@ -60,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ enum bch_validate_flags);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ab811c0dad26..79be0eaddfa0 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j)
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
@@ -223,9 +225,9 @@ void bch2_journal_space_available(struct journal *j)
j->space[journal_space_clean_ondisk].total) &&
(clean - clean_ondisk <= total / 8) &&
(clean_ondisk * 2 > clean))
- set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ set_bit(JOURNAL_may_skip_flush, &j->flags);
else
- clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ clear_bit(JOURNAL_may_skip_flush, &j->flags);
bch2_journal_set_watermark(j);
out:
@@ -816,7 +818,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
- ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
!fifo_used(&j->pin);
@@ -831,7 +833,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
/* time_stats this */
bool did_work = false;
- if (!test_bit(JOURNAL_STARTED, &j->flags))
+ if (!test_bit(JOURNAL_running, &j->flags))
return false;
closure_wait_event(&j->async_wait,
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index ae4fb8c3a2bc..db80e506e3ab 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r)
return cmp_int(*l, *r);
}
-static int bch2_sb_journal_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal *journal = field_to_type(f, journal);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
@@ -99,9 +98,8 @@ static int u64_range_cmp(const void *_l, const void *_r)
return cmp_int(l->start, r->start);
}
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index b5303874fc35..ed4846709611 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "btree_iter.h"
#include "eytzinger.h"
+#include "journal.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
@@ -95,8 +95,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
@@ -163,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
return 0;
}
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
@@ -218,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
-void bch2_blacklist_entries_gc(struct work_struct *work)
+bool bch2_blacklist_entries_gc(struct bch_fs *c)
{
- struct bch_fs *c = container_of(work, struct bch_fs,
- journal_seq_blacklist_gc_work);
- struct journal_seq_blacklist_table *t;
- struct bch_sb_field_journal_seq_blacklist *bl;
struct journal_seq_blacklist_entry *src, *dst;
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned i, nr, new_nr;
- int ret;
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
- 0, 0, BTREE_ITER_PREFETCH);
-retry:
- bch2_trans_begin(trans);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- while (!(ret = PTR_ERR_OR_ZERO(b)) &&
- b &&
- !test_bit(BCH_FS_stopping, &c->flags))
- b = bch2_btree_iter_next_node(&iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_iter_exit(trans, &iter);
- }
-
- bch2_trans_put(trans);
- if (ret)
- return;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
if (!bl)
- goto out;
+ return false;
- nr = blacklist_nr_entries(bl);
+ unsigned nr = blacklist_nr_entries(bl);
dst = bl->start;
- t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
+ unsigned i;
for (src = bl->start, i = eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
- if (t->entries[i].dirty)
+ if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
}
- new_nr = dst - bl->start;
-
- bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- if (new_nr != nr) {
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
+ unsigned new_nr = dst - bl->start;
+ if (new_nr == nr)
+ return false;
- if (!new_nr)
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+ bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
- bch2_write_super(c);
- }
-out:
- mutex_unlock(&c->sb_lock);
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+ return true;
}
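
To make the new retention rule concrete: with c->journal.oldest_seq_found_ondisk at, say, 1000, a clean blacklist entry covering sequences [400, 600] is dropped, while one covering [900, 1100] survives (its end is still >= the oldest sequence present on disk), as does any entry marked dirty.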
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index afb886ec8e25..d47636f96fdc 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-void bch2_blacklist_entries_gc(struct work_struct *);
+bool bch2_blacklist_entries_gc(struct bch_fs *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
new file mode 100644
index 000000000000..2566b12dbc04
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist_format.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 8c053cb64ca5..19183fcf7ad7 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -129,11 +129,17 @@ enum journal_space_from {
journal_space_nr,
};
+#define JOURNAL_FLAGS() \
+ x(replay_done) \
+ x(running) \
+ x(may_skip_flush) \
+ x(need_flush_write) \
+ x(space_low)
+
enum journal_flags {
- JOURNAL_REPLAY_DONE,
- JOURNAL_STARTED,
- JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NEED_FLUSH_WRITE,
+#define x(n) JOURNAL_##n,
+ JOURNAL_FLAGS()
+#undef x
};
/* Reasons we may fail to get a journal reservation: */
@@ -228,6 +234,7 @@ struct journal {
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
+ u64 oldest_seq_found_ondisk;
/*
* FIFO of journal entries whose btree updates have not yet been
@@ -325,6 +332,7 @@ struct journal_device {
/* for bch_journal_read_device */
struct closure read;
+ u64 highest_seq_found;
};
/*
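
JOURNAL_FLAGS() above is an x-macro: a single list that expands into the enum here and can expand into a matching name table elsewhere. A minimal, self-contained illustration of the pattern (not bcachefs code; the names are made up):

#include <stdio.h>

#define MY_FLAGS()	\
	x(replay_done)	\
	x(running)	\
	x(space_low)

enum my_flags {
#define x(n)	FLAG_##n,
	MY_FLAGS()
#undef x
	FLAG_NR,
};

static const char * const my_flag_names[] = {
#define x(n)	#n,
	MY_FLAGS()
#undef x
	NULL,
};

int main(void)
{
	/* One list, two expansions: enum values and their printable names. */
	for (unsigned i = 0; i < FLAG_NR; i++)
		printf("%u: %s\n", i, my_flag_names[i]);
	return 0;
}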
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 9fac838d123e..f49fdca1d07d 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
if (!fn)
return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+
+ return trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
@@ -57,7 +56,7 @@ int bch2_resume_logged_ops(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter,
BTREE_ID_logged_ops, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 26569043e368..a40d116224ed 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -11,7 +11,7 @@
/* KEY_TYPE_lru is obsolete: */
int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -149,7 +149,7 @@ int bch2_check_lrus(struct bch_fs *c)
struct bpos last_flushed_pos = POS_MIN;
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 429dca816df5..fb11ab0dd00e 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -49,7 +49,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
}
int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index db63b3f3b338..e9d9c0212e44 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
- s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 };
@@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_4(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 10, 11, 12, 13, 14 };
- s64 stddev[] = { 9, 13, 15, 17, 19 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
static void mean_and_variance_fast_divpow2(struct kunit *test)
{
s64 i;
@@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2),
- KUNIT_CASE(mean_and_variance_test_3),
- KUNIT_CASE(mean_and_variance_test_4),
{}
};
@@ -243,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = {
kunit_test_suite(mean_and_variance_test_suite);
MODULE_AUTHOR("Daniel B. Hill");
+MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 69098eeb5d48..ddc187fb693d 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
if (!bch2_bkey_has_device_c(k, dev_idx))
return 0;
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
@@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
@@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
continue;
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
@@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
for (id = 0; id < BTREE_ID_NR; id++) {
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index bf68ea49447b..6e477fadaa2a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -41,28 +41,23 @@ static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c
struct data_update_opts *data_opts)
{
printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:");
- prt_tab(out);
+ prt_str(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_newline(out);
- prt_str(out, "kill ptrs: ");
- prt_tab(out);
+ prt_str(out, "kill ptrs:\t");
bch2_prt_u64_base2(out, data_opts->kill_ptrs);
prt_newline(out);
- prt_str(out, "target: ");
- prt_tab(out);
+ prt_str(out, "target:\t");
bch2_target_to_text(out, c, data_opts->target);
prt_newline(out);
- prt_str(out, "compression: ");
- prt_tab(out);
+ prt_str(out, "compression:\t");
bch2_compression_opt_to_text(out, background_compression(*io_opts));
prt_newline(out);
- prt_str(out, "extra replicas: ");
- prt_tab(out);
+ prt_str(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
}
@@ -421,7 +416,7 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
io_opts->d.nr = 0;
ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_all_snapshots, k, ({
if (k.k->p.offset != extent_k.k->p.inode)
break;
@@ -467,7 +462,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
@@ -552,9 +547,10 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+ bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
@@ -695,6 +691,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bpos bp_pos = POS_MIN;
int ret = 0;
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ if (!ca)
+ return 0;
+
trace_bucket_evacuate(c, &bucket);
bch2_bkey_buf_init(&sk);
@@ -705,7 +705,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED);
+ bucket, BTREE_ITER_cached);
ret = lockrestart_do(trans,
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
bch2_trans_iter_exit(trans, &iter);
@@ -716,7 +716,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
a = bch2_alloc_to_v4(k, &a_convert);
dirty_sectors = bch2_bucket_sectors_dirty(*a);
- bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+ bucket_size = ca->mi.bucket_size;
fragmentation = a->fragmentation_lru;
ret = bch2_btree_write_buffer_tryflush(trans);
@@ -730,9 +730,9 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
- ret = bch2_get_next_backpointer(trans, bucket, gen,
+ ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
&bp_pos, &bp,
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
@@ -828,6 +828,7 @@ next:
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
+ bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
@@ -868,7 +869,7 @@ static int bch2_move_btree(struct bch_fs *c,
continue;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
@@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
@@ -968,27 +982,17 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg,
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
+/*
+ * Ancient versions of bcachefs produced packed formats which could represent
+ * keys that the in memory format cannot represent; this checks for those
+ * formats so we can get rid of them.
+ */
static bool bformat_needs_redo(struct bkey_format *f)
{
- unsigned i;
-
- for (i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f->bits_per_field[i] > unpacked_bits)
+ for (unsigned i = 0; i < f->nr_fields; i++)
+ if (bch2_bkey_format_field_overflows(f, i))
return true;
- if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
- return true;
-
- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
- unpacked_mask) <
- field_offset)
- return true;
- }
-
return false;
}
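
The open-coded checks deleted above are what bch2_bkey_format_field_overflows() presumably now encapsulates (the helper itself is not part of this hunk). A standalone restatement of that arithmetic, kept deliberately close to the removed code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A packed field "overflows" the in-memory format when it is wider than the
 * unpacked field, carries a redundant offset at equal width, or when
 * offset + largest packed value wraps past the unpacked field's range.
 */
static bool field_overflows(unsigned packed_bits, uint64_t field_offset,
			    unsigned unpacked_bits)
{
	uint64_t unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));

	if (packed_bits > unpacked_bits)
		return true;
	if (packed_bits == unpacked_bits && field_offset)
		return true;
	return ((field_offset + ((1ULL << packed_bits) - 1)) & unpacked_mask)
		< field_offset;
}

int main(void)
{
	/* 32-bit field packed in 20 bits at offset 100: representable */
	printf("%d\n", field_overflows(20, 100, 32));
	/* same field at an offset just below 2^32: wraps, needs rewrite */
	printf("%d\n", field_overflows(20, (1ULL << 32) - 10, 32));
	return 0;
}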
@@ -1043,6 +1047,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1053,6 +1058,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
+ rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
@@ -1137,23 +1143,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "keys moved: ");
- prt_u64(out, atomic64_read(&stats->keys_moved));
- prt_newline(out);
-
- prt_str(out, "keys raced: ");
- prt_u64(out, atomic64_read(&stats->keys_raced));
- prt_newline(out);
-
- prt_str(out, "bytes seen: ");
+ prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_str(out, "bytes moved: ");
+ prt_printf(out, "bytes moved: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_str(out, "bytes raced: ");
+ prt_printf(out, "bytes raced: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1167,19 +1167,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
- prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->read_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->read_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
- prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->write_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->write_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 0d2b82d8d11f..10bfb31c151b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -84,7 +84,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_CACHED);
+ b->k.bucket, BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -158,6 +158,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
+ bch2_trans_begin(trans);
+
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 08ea0cfc4aef..bb068fd72465 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,6 +7,7 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -42,7 +43,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -52,7 +53,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -82,18 +83,39 @@ const char * const bch2_member_states[] = {
NULL
};
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
BCH_FS_USAGE_TYPES()
NULL
};
#undef x
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+
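The PRT_STR_OPT_BOUNDSCHECKED() wrappers above guard the name-table lookup so an out-of-range enum value prints as "(unknown <type> N)" rather than indexing past the array. A standalone sketch of the same guard, with illustrative names:

#include <stdio.h>

static const char * const fruit_names[] = { "apple", "pear", "plum", NULL };

/* Print names[idx] if in range, otherwise an "(unknown ...)" marker. */
static void print_bounded(const char * const names[], unsigned nr,
			  const char *type, unsigned idx)
{
	if (idx < nr)
		printf("%s", names[idx]);
	else
		printf("(unknown %s %u)", type, idx);
}

int main(void)
{
	unsigned nr = sizeof(fruit_names) / sizeof(fruit_names[0]) - 1;

	print_bounded(fruit_names, nr, "fruit", 1);	/* pear */
	printf("\n");
	print_bounded(fruit_names, nr, "fruit", 7);	/* (unknown fruit 7) */
	printf("\n");
	return 0;
}
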
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
{
@@ -205,6 +227,9 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = ARRAY_SIZE(_choices), \
.choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 136083c11f3a..25530e0bb2f3 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+
static inline const char *bch2_d_type_str(unsigned d_type)
{
return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
@@ -362,12 +364,17 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
- x(keep_journal, u8, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys after startup")\
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
@@ -419,11 +426,6 @@ enum fsck_err_opts {
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
- x(buckets_nouse, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allocate the buckets_nouse bitmap") \
x(stdio, u64, \
0, \
OPT_UINT(0, S64_MAX), \
@@ -473,7 +475,7 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\
+ NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
struct bch_opts {
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index b27d22925929..9f529e4c1b16 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -10,35 +10,57 @@
#include "printbuf.h"
-static inline unsigned printbuf_linelen(struct printbuf *buf)
+static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
{
- return buf->pos - buf->last_newline;
+ return pos - buf->last_newline;
}
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+static inline unsigned printbuf_linelen(struct printbuf *buf)
{
- unsigned new_size;
- char *buf;
+ return __printbuf_linelen(buf, buf->pos);
+}
- if (!out->heap_allocated)
- return 0;
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
/* Reserved space for terminating nul: */
extra += 1;
- if (out->pos + extra < out->size)
+ if (out->pos + extra <= out->size)
return 0;
- new_size = roundup_pow_of_two(out->size + extra);
+ if (!out->heap_allocated) {
+ out->overflow = true;
+ return 0;
+ }
+
+ unsigned new_size = roundup_pow_of_two(out->size + extra);
+
+ /* Sanity check... */
+ if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
+ out->allocation_failure = true;
+ out->overflow = true;
+ return -ENOMEM;
+ }
/*
* Note: output buffer must be freeable with kfree(), it's not required
* that the user use printbuf_exit().
*/
- buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+ char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
if (!buf) {
out->allocation_failure = true;
+ out->overflow = true;
return -ENOMEM;
}
@@ -47,6 +69,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
return 0;
}
+static void printbuf_advance_pos(struct printbuf *out, unsigned len)
+{
+ out->pos += min(len, printbuf_remaining(out));
+}
+
+static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
+{
+ unsigned move = out->pos - pos;
+
+ bch2_printbuf_make_room(out, nr);
+
+ if (pos + nr < out->size)
+ memmove(out->buf + pos + nr,
+ out->buf + pos,
+ min(move, out->size - 1 - pos - nr));
+
+ if (pos < out->size)
+ memset(out->buf + pos, ' ', min(nr, out->size - pos));
+
+ printbuf_advance_pos(out, nr);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ while (true) {
+ int pad;
+ unsigned len = out->pos - pos;
+ char *p = out->buf + pos;
+ char *n = memscan(p, '\n', len);
+ if (cur_tabstop(out)) {
+ n = min(n, (char *) memscan(p, '\r', len));
+ n = min(n, (char *) memscan(p, '\t', len));
+ }
+
+ pos = n - out->buf;
+ if (pos == out->pos)
+ break;
+
+ switch (*n) {
+ case '\n':
+ pos++;
+ out->last_newline = pos;
+
+ printbuf_insert_spaces(out, pos, out->indent);
+
+ pos = min(pos + out->indent, out->pos);
+ out->last_field = pos;
+ out->cur_tabstop = 0;
+ break;
+ case '\r':
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
+ if (pad > 0) {
+ printbuf_insert_spaces(out, out->last_field, pad);
+ pos += pad;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ case '\t':
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
+ if (pad > 0) {
+ *n = ' ';
+ printbuf_insert_spaces(out, pos, pad - 1);
+ pos += pad;
+ } else {
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ }
+ }
+}
+
+static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
+ __printbuf_do_indent(out, pos);
+}
+
void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
{
int len;
@@ -55,14 +163,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_list args2;
va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
va_end(args2);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
@@ -72,14 +180,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
do {
va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
va_end(args);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -194,33 +302,20 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
void bch2_prt_newline(struct printbuf *buf)
{
- unsigned i;
-
bch2_printbuf_make_room(buf, 1 + buf->indent);
- __prt_char(buf, '\n');
+ __prt_char_reserved(buf, '\n');
buf->last_newline = buf->pos;
- for (i = 0; i < buf->indent; i++)
- __prt_char(buf, ' ');
+ __prt_chars_reserved(buf, ' ', buf->indent);
- printbuf_nul_terminate(buf);
+ printbuf_nul_terminate_reserved(buf);
buf->last_field = buf->pos;
buf->cur_tabstop = 0;
}
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
-}
-
static void __prt_tab(struct printbuf *out)
{
int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
@@ -247,24 +342,9 @@ void bch2_prt_tab(struct printbuf *out)
static void __prt_tab_rjust(struct printbuf *buf)
{
- unsigned move = buf->pos - buf->last_field;
int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
-
- if (pad > 0) {
- bch2_printbuf_make_room(buf, pad);
-
- if (buf->last_field + pad < buf->size)
- memmove(buf->buf + buf->last_field + pad,
- buf->buf + buf->last_field,
- min(move, buf->size - 1 - buf->last_field - pad));
-
- if (buf->last_field < buf->size)
- memset(buf->buf + buf->last_field, ' ',
- min((unsigned) pad, buf->size - buf->last_field));
-
- buf->pos += pad;
- printbuf_nul_terminate(buf);
- }
+ if (pad > 0)
+ printbuf_insert_spaces(buf, buf->last_field, pad);
buf->last_field = buf->pos;
buf->cur_tabstop++;
@@ -301,41 +381,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
*/
void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
{
- const char *unprinted_start = str;
- const char *end = str + count;
-
- if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
- prt_bytes(out, str, count);
- return;
- }
-
- while (str != end) {
- switch (*str) {
- case '\n':
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- bch2_prt_newline(out);
- break;
- case '\t':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab(out);
- }
- break;
- case '\r':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab_rjust(out);
- }
- break;
- }
-
- str++;
- }
-
- prt_bytes(out, unprinted_start, str - unprinted_start);
+ unsigned indent_pos = out->pos;
+ prt_bytes(out, str, count);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -348,9 +396,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
{
bch2_printbuf_make_room(out, 10);
- out->pos += string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
+ unsigned len = string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+ printbuf_advance_pos(out, len);
}
/**
@@ -402,9 +451,7 @@ void bch2_prt_string_option(struct printbuf *out,
const char * const list[],
size_t selected)
{
- size_t i;
-
- for (i = 0; list[i]; i++)
+ for (size_t i = 0; list[i]; i++)
bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
}
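
With indent/tabstop handling moved into printbuf_do_indent(), callers can embed '\t' (advance to the next tabstop) and '\r' (right-justify against it) directly in format strings, which is what the move.c and quota.c hunks switch to. The snippet below is not the printbuf implementation, only a plain-C illustration of the left-tabstop padding that a '\t' produces:

#include <stdio.h>
#include <string.h>

/* Pad the current line out to the tabstop column before printing the value. */
static void print_field(const char *label, unsigned long long v, unsigned tabstop)
{
	int pad = (int)tabstop - (int)strlen(label);

	printf("%s%*s%llu\n", label, pad > 0 ? pad : 1, "", v);
}

int main(void)
{
	print_field("d_spc_hardlimit", 1024, 20);
	print_field("d_space", 4096, 20);
	return 0;
}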
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 9a4a56c40937..9ecc56bc9635 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -86,6 +86,7 @@ struct printbuf {
u8 atomic;
bool allocation_failure:1;
bool heap_allocated:1;
+ bool overflow:1;
enum printbuf_si si_units:1;
bool human_readable_units:1;
bool has_indent_or_tabstops:1;
@@ -142,7 +143,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
*/
static inline unsigned printbuf_remaining_size(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos : 0;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ return out->size - out->pos;
}
/*
@@ -151,7 +154,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out)
*/
static inline unsigned printbuf_remaining(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos - 1 : 0;
+ return out->size ? printbuf_remaining_size(out) - 1 : 0;
}
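Concretely, for a printbuf with size == 8 and pos == 5, printbuf_remaining_size() reports 3 (bytes left including the terminating nul) while printbuf_remaining() reports 2 (printable characters that still fit).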
static inline unsigned printbuf_written(struct printbuf *out)
@@ -159,30 +162,25 @@ static inline unsigned printbuf_written(struct printbuf *out)
return out->size ? min(out->pos, out->size - 1) : 0;
}
-/*
- * Returns true if output was truncated:
- */
-static inline bool printbuf_overflowed(struct printbuf *out)
+static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
{
- return out->pos >= out->size;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ if (out->size)
+ out->buf[out->pos] = 0;
}
static inline void printbuf_nul_terminate(struct printbuf *out)
{
bch2_printbuf_make_room(out, 1);
-
- if (out->pos < out->size)
- out->buf[out->pos] = 0;
- else if (out->size)
- out->buf[out->size - 1] = 0;
+ printbuf_nul_terminate_reserved(out);
}
/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
static inline void __prt_char_reserved(struct printbuf *out, char c)
{
if (printbuf_remaining(out))
- out->buf[out->pos] = c;
- out->pos++;
+ out->buf[out->pos++] = c;
}
/* Doesn't nul terminate: */
@@ -194,37 +192,34 @@ static inline void __prt_char(struct printbuf *out, char c)
static inline void prt_char(struct printbuf *out, char c)
{
- __prt_char(out, c);
- printbuf_nul_terminate(out);
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, c);
+ printbuf_nul_terminate_reserved(out);
}
static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
{
- unsigned i, can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = c;
- out->pos += n - can_print;
}
static inline void prt_chars(struct printbuf *out, char c, unsigned n)
{
bch2_printbuf_make_room(out, n);
__prt_chars_reserved(out, c, n);
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
{
- unsigned i, can_print;
-
bch2_printbuf_make_room(out, n);
- can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = ((char *) b)[i];
- out->pos += n - can_print;
printbuf_nul_terminate(out);
}
@@ -241,18 +236,18 @@ static inline void prt_str_indented(struct printbuf *out, const char *str)
static inline void prt_hex_byte(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_hi(byte));
__prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_upper_hi(byte));
__prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
/**
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index e68b34eab90a..a0cca8b70e0a 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = {
};
static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
@@ -60,8 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
};
int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
int ret = 0;
@@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "i_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", i->i_fieldmask);
- prt_newline(out);
-
- prt_str(out, "i_flags");
- prt_tab(out);
- prt_printf(out, "%u", i->i_flags);
- prt_newline(out);
-
- prt_str(out, "i_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_warnlimit);
- prt_newline(out);
+ prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
+ prt_printf(out, "i_flags\t%u\n", i->i_flags);
+ prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
+ prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
+ prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
+ prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
+ prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
+ prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
}
static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
@@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "d_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", q->d_fieldmask);
- prt_newline(out);
-
- prt_str(out, "d_spc_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_spc_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_space");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_space);
- prt_newline(out);
-
- prt_str(out, "d_ino_count");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_count);
- prt_newline(out);
-
- prt_str(out, "d_ino_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_timer);
- prt_newline(out);
-
- prt_str(out, "d_spc_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_timer);
- prt_newline(out);
-
- prt_str(out, "d_ino_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_ino_warns);
- prt_newline(out);
-
- prt_str(out, "d_spc_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_spc_warns);
- prt_newline(out);
+ prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
+ prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
+ prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
+ prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit);
+ prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
+ prt_printf(out, "d_space\t%llu\n", q->d_space);
+ prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
+ prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
+ prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
+ prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
+ prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
}
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
@@ -560,13 +485,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_snapshot_tree s_t;
- int ret;
+ u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__,
- snapshot_t(c, k.k->p.snapshot)->tree);
+ "%s: snapshot tree %u not found", __func__, tree);
if (ret)
return ret;
@@ -612,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
__bch2_quota_set(c, k, NULL)) ?:
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
bch2_fs_quota_read_inode(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
@@ -902,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
ret = bkey_err(k);
if (unlikely(ret))
return ret;
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 884f601f41c4..02d37a332218 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -5,11 +5,11 @@
#include "inode.h"
#include "quota_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota ((struct bkey_ops) { \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 56336f3dd1d0..cf81e5128c3a 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -42,7 +42,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -89,7 +89,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -140,7 +140,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
@@ -323,12 +323,14 @@ static int do_rebalance(struct moving_context *ctxt)
struct bkey_s_c k;
int ret = 0;
+ bch2_trans_begin(trans);
+
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
bch2_trans_iter_init(trans, &rebalance_work_iter,
BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
if (!r->enabled) {
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 03f9d6afe467..cf513fc79ce4 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,35 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
#include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
-#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
-#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
-#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
-#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -37,22 +33,25 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static bool btree_id_is_alloc(enum btree_id id)
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
+ if (btree >= BTREE_ID_NR_MAX)
+ return;
+
+ u64 b = BIT_ULL(btree);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
}
/* for -o reconstruct_alloc: */
-static void do_reconstruct_alloc(struct bch_fs *c)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
bch2_journal_log_msg(c, "dropping alloc info");
bch_info(c, "dropping and reconstructing all alloc info");
@@ -69,9 +68,20 @@ static void do_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
@@ -87,15 +97,17 @@ static void do_reconstruct_alloc(struct bch_fs *c)
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- struct journal_keys *keys = &c->journal_keys;
- size_t src, dst;
-
- move_gap(keys, keys->nr);
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->data[src].btree_id))
- keys->data[dst++] = keys->data[src];
- keys->nr = keys->gap = dst;
+ bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/*
@@ -127,9 +139,9 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
{
struct btree_iter iter;
unsigned iter_flags =
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS;
- unsigned update_flags = BTREE_TRIGGER_NORUN;
+ BTREE_ITER_intent|
+ BTREE_ITER_not_extents;
+ unsigned update_flags = BTREE_TRIGGER_norun;
int ret;
if (k->overwritten)
@@ -138,17 +150,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
trans->journal_res.seq = k->journal_seq;
/*
- * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+ * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
* besides the allocator is doing updates yet so we don't need key cache
* coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+ * btrees use BTREE_ITER_filter_snapshots, which isn't available until
* the snapshots recovery pass runs.
*/
if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_CACHED;
+ iter_flags |= BTREE_ITER_cached;
else
- update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+ update_flags |= BTREE_UPDATE_key_cache_reclaim;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
@@ -186,14 +198,15 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_trans *trans = NULL;
+ bool immediate_flush = false;
int ret = 0;
if (keys->nr) {
@@ -206,6 +219,7 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
move_gap(keys, keys->nr);
+ trans = bch2_trans_get(c);
/*
* First, attempt to replay keys in sorted order. This is more
@@ -215,6 +229,13 @@ static int bch2_journal_replay(struct bch_fs *c)
darray_for_each(*keys, k) {
cond_resched();
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
+
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
commit_do(trans, NULL, NULL,
@@ -243,7 +264,10 @@ static int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- replay_now_at(j, k->journal_seq);
+ if (k->journal_seq)
+ replay_now_at(j, k->journal_seq);
+ else
+ replay_now_at(j, j->replay_journal_seq_end);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
@@ -266,7 +290,8 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.keep_journal)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -274,6 +299,12 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
@@ -345,14 +376,17 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
- struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
- ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
- ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
- ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
- }
+ unsigned nr_types = jset_entry_dev_usage_nr_types(u);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev));
+ if (ca)
+ for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+ rcu_read_unlock();
break;
}
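
The dev_usage change above swaps the unconditional bch_dev_bkey_exists() lookup for an RCU-protected bch2_dev_rcu() that may return NULL, so a usage entry naming a device that is no longer a member is skipped rather than dereferenced. The shape of that guarded lookup, as a sketch (apply_usage() is a hypothetical stand-in for the counter copy in the hunk):

	/*
	 * Sketch only: the member table can change at runtime, so the device is
	 * resolved under rcu_read_lock() and must be NULL-checked before use.
	 * bch2_dev_rcu() is the helper from the hunk; apply_usage() is made up.
	 */
	static void dev_usage_apply_guarded(struct bch_fs *c, unsigned dev_idx,
					    const struct jset_entry_dev_usage *u)
	{
		rcu_read_lock();
		struct bch_dev *ca = bch2_dev_rcu(c, dev_idx);
		if (ca)
			apply_usage(ca, u);	/* hypothetical helper */
		rcu_read_unlock();
	}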
@@ -423,10 +457,9 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
- unsigned i;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
@@ -435,186 +468,46 @@ static int read_btree_roots(struct bch_fs *c)
if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
- if (r->error) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- btree_root_bkey_invalid,
- "invalid btree root %s",
- bch2_btree_id_str(i));
- if (i == BTREE_ID_alloc)
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i)) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s l=%u: %s",
+ bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i)) {
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- }
+ r->error = 0;
+ } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ bch_info(c, "will run btree node scan");
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ }
- ret = bch2_btree_root_read(c, i, &r->key, r->level);
- if (ret) {
- fsck_err(c,
- btree_root_read_error,
- "error reading btree root %s",
- bch2_btree_id_str(i));
- if (btree_id_is_alloc(i))
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
+ bch2_btree_lost_data(c, i);
}
}
- for (i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (!r->b) {
+ if (!r->b && !r->error) {
r->alive = false;
r->level = 0;
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
fsck_err:
return ret;
}
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten: it will be accessed by multiple
- * threads
- */
- move_gap(keys, keys->nr);
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || c->opts.fsck || !c->sb.clean)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
static bool check_version_upgrade(struct bch_fs *c)
{
unsigned latest_version = bcachefs_metadata_version_current;
@@ -687,96 +580,6 @@ static bool check_version_upgrade(struct bch_fs *c)
return false;
}
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
-
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
-
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
- }
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
- }
-
- return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
-
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
- }
-
- return ret;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
@@ -809,69 +612,57 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
- }
-
- if (!c->opts.nochanges) {
- mutex_lock(&c->sb_lock);
- bool write_sb = false;
-
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
+ if (c->opts.norecovery)
+ c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ bool write_sb = false;
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
- prt_str(&buf, "Version downgrade required:");
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
+ prt_str(&buf, "Version downgrade required:");
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, "\n running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- if (check_version_upgrade(c))
- write_sb = true;
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
- if (write_sb)
- bch2_write_super(c);
+ if (check_version_upgrade(c))
+ write_sb = true;
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- mutex_unlock(&c->sb_lock);
- }
+ if (write_sb)
+ bch2_write_super(c);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
@@ -885,7 +676,9 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ bch2_journal_pos_from_member_info_resume(c);
+
+ if (!c->sb.clean || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -965,7 +758,7 @@ use_clean:
c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc)
- do_reconstruct_alloc(c);
+ bch2_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@@ -1017,6 +810,14 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ }
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
test_bit(BCH_FS_errors_fixed, &c->flags) &&
@@ -1051,7 +852,8 @@ use_clean:
}
mutex_lock(&c->sb_lock);
- bool write_sb = false;
+ ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
@@ -1064,15 +866,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -1083,6 +888,9 @@ use_clean:
write_sb = true;
}
+ if (bch2_blacklist_entries_gc(c))
+ write_sb = true;
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1105,18 +913,16 @@ use_clean:
bch_info(c, "scanning for old btree nodes done");
}
- if (c->journal_seq_blacklist_table &&
- c->journal_seq_blacklist_table->nr > 128)
- queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
-
ret = 0;
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!c->opts.retain_recovery_info) {
bch2_journal_keys_put_initial(c);
- kfree(clean);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
+ if (!IS_ERR(clean))
+ kfree(clean);
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
@@ -1141,6 +947,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1155,11 +962,11 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
@@ -1230,7 +1037,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b2e..4bf818de1f2f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-extern const char * const bch2_recovery_passes[];
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return 0;
-
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 000000000000..4a9eb9582b6e
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
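+
+A note on the two helpers above: bch2_recovery_passes_to_stable()/_from_stable() translate between the in-memory pass numbering, which may be reordered between releases (as the moved resume_logged_ops and the new scan_for_btree_nodes/reconstruct_snapshots entries later in this diff show), and the stable ids stored in the superblock. A standalone sketch of the same x-macro mapping technique, with made-up pass names (PASS_LIST and everything below is illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	/* x-macro list: name, stable id (illustrative passes, not the bcachefs ones) */
	#define PASS_LIST()			\
		x(scan_nodes,	37)		\
		x(check_topo,	4)		\
		x(alloc_read,	0)

	enum pass {
	#define x(n, id)	PASS_##n,
		PASS_LIST()
	#undef x
		PASS_NR
	};

	enum pass_stable {
	#define x(n, id)	PASS_STABLE_##n = id,
		PASS_LIST()
	#undef x
	};

	/* in-memory pass number -> stable superblock id, generated from the list */
	static const uint8_t to_stable[] = {
	#define x(n, id)	[PASS_##n] = PASS_STABLE_##n,
		PASS_LIST()
	#undef x
	};

	/* translate a bitmask of in-memory passes into a bitmask of stable ids */
	static uint64_t passes_to_stable(uint64_t v)
	{
		uint64_t ret = 0;

		for (unsigned i = 0; i < sizeof(to_stable); i++)
			if (v & (1ULL << i))
				ret |= 1ULL << to_stable[i];
		return ret;
	}

	int main(void)
	{
		/* PASS_check_topo is bit 1 in memory, stable bit 4 on disk */
		printf("%#llx\n",
		       (unsigned long long) passes_to_stable(1ULL << PASS_check_topo));
		return 0;
	}

Because both tables are generated from the same list, reordering the list changes only the in-memory enum; the stable ids written to the superblock stay fixed.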
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
+ break;
+
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?:
+ bch2_journal_flush(&c->journal);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
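+
+bch2_run_recovery_passes() now also honors recovery_pass_last (set from norecovery earlier in this diff), flushes the journal after each pass, and clears the superblock's required-pass bit once a pass completes without errors. The rewind protocol it shares with bch2_run_explicit_recovery_pass() -- a pass may reset curr_recovery_pass to an earlier value and return restart_recovery, and the loop continues from there -- can be modelled in isolation (a toy sketch; the names and the RESTART constant are made up):

	#include <stdio.h>

	enum { NR_PASSES = 5 };
	#define RESTART 1

	/* toy pass: pass 3 discovers, once, that it needs pass 1 re-run */
	static int run_pass(unsigned pass, unsigned *curr)
	{
		static int rewound;

		if (pass == 3 && !rewound) {
			rewound = 1;
			*curr = 1;	/* rewind, as the explicit-pass helper does */
			return RESTART;
		}
		printf("pass %u done\n", pass);
		return 0;
	}

	int main(void)
	{
		unsigned curr = 0;

		while (curr < NR_PASSES) {
			unsigned pass = curr;
			int ret = run_pass(pass, &curr);

			if (ret == RESTART || (ret && curr < pass))
				continue;	/* rewound: resume from the earlier pass */
			if (ret)
				break;
			curr++;
		}
		return 0;
	}

Running it prints passes 0, 1, 2, then rewinds at 3 and replays 1, 2 before finishing 3 and 4 -- the same resume-from-earlier behaviour the real loop implements around BCH_ERR_restart_recovery.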
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 000000000000..99b464e127b8
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h
index 4959e95e7c74..773aea9a0080 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define PASS_SILENT BIT(0)
#define PASS_FSCK BIT(1)
@@ -13,6 +13,7 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
@@ -31,13 +32,13 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
x(check_extents, 25, PASS_FSCK) \
x(check_indirect_extents, 26, PASS_FSCK) \
@@ -47,6 +48,7 @@
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
@@ -56,6 +58,7 @@ enum bch_recovery_pass {
#define x(n, id, when) BCH_RECOVERY_PASS_##n,
BCH_RECOVERY_PASSES()
#undef x
+ BCH_RECOVERY_PASS_NR
};
/* But we also need stable identifiers that can be used in the superblock */
@@ -65,4 +68,4 @@ enum bch_recovery_pass_stable {
#undef x
};
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b394..9ac6cf21cfbf 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -30,7 +30,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
@@ -74,20 +74,20 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_i *k;
__le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
struct printbuf buf = PRINTBUF;
int ret;
k = bch2_bkey_get_mut_noupdate(trans, &iter,
BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_with_updates);
ret = PTR_ERR_OR_ZERO(k);
if (ret)
goto err;
@@ -102,7 +102,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
goto err;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
@@ -111,7 +111,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
goto err;
}
- if (flags & BTREE_TRIGGER_INSERT) {
+ if (flags & BTREE_TRIGGER_insert) {
struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
u64 pad;
@@ -141,12 +141,13 @@ err:
}
static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags, size_t r_idx)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags,
+ size_t r_idx)
{
struct bch_fs *c = trans->c;
struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
u64 start = le64_to_cpu(p.v->idx);
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
u64 next_idx = end + le32_to_cpu(p.v->back_pad);
@@ -163,10 +164,13 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
BUG_ON((s64) r->refcount + add < 0);
- r->refcount += add;
+ if (flags & BTREE_TRIGGER_gc)
+ r->refcount += add;
*idx = r->offset;
return 0;
not_found:
+ BUG_ON(!(flags & BTREE_TRIGGER_check_repair));
+
if (fsck_err(c, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
@@ -185,12 +189,11 @@ not_found:
} else {
bkey_error_init(update);
update->k.p = p.k->p;
- update->k.p.offset = next_idx;
- update->k.size = next_idx - *idx;
+ update->k.size = p.k->size;
set_bkey_val_u64s(&update->k, 0);
}
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun);
}
*idx = next_idx;
@@ -201,8 +204,8 @@ fsck_err:
}
static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
@@ -211,12 +214,12 @@ static int __trigger_reflink_p(struct btree_trans *trans,
u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
while (idx < end && !ret)
ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
size_t l = 0, r = c->reflink_gc_nr;
while (l < r) {
@@ -239,10 +242,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert)) {
struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
v->front_pad = v->back_pad = 0;
@@ -254,7 +257,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return bch2_bkey_ptrs_invalid(c, k, flags, err);
@@ -282,23 +285,25 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
-static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
+static inline void
+check_indirect_extent_deleting(struct bkey_s new,
+ enum btree_iter_update_trigger_flags *flags)
{
- if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
new.k->type = KEY_TYPE_deleted;
new.k->size = 0;
set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_INSERT;
+ *flags &= ~BTREE_TRIGGER_insert;
}
}
int bch2_trigger_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT))
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert))
check_indirect_extent_deleting(new, &flags);
return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
@@ -307,7 +312,7 @@ int bch2_trigger_reflink_v(struct btree_trans *trans,
/* indirect inline data */
int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return 0;
@@ -327,7 +332,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
check_indirect_extent_deleting(new, &flags);
@@ -350,7 +355,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_prev(&reflink_iter);
ret = bkey_err(k);
if (ret)
@@ -395,7 +400,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch2_trans_iter_exit(trans, &reflink_iter);
@@ -456,9 +461,9 @@ s64 bch2_remap_range(struct bch_fs *c,
goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while ((ret == 0 ||
bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
@@ -568,7 +573,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_begin(trans);
ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_INTENT);
+ dst_inum, BTREE_ITER_intent);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 4d8867289717..e894f3a2c67a 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -2,15 +2,16 @@
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
@@ -21,11 +22,12 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
})
int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
@@ -36,13 +38,13 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
})
int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
- unsigned);
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..57a1f09cca09 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@@ -20,14 +23,12 @@ static int bch2_memcmp(const void *l, const void *r, size_t size)
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- unsigned i;
-
BUG_ON(e->data_type >= BCH_DATA_NR);
BUG_ON(!e->nr_devs);
BUG_ON(e->nr_required > 1 &&
e->nr_required >= e->nr_devs);
- for (i = 0; i + 1 < e->nr_devs; i++)
+ for (unsigned i = 0; i + 1 < e->nr_devs; i++)
BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}
@@ -39,7 +40,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
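
bch2_memcmp() changes signature here because the sort helpers now take an (l, r, priv) comparator; the element size is carried through the opaque priv argument, cast back to size_t, instead of being a dedicated parameter. A userspace sketch of that context-carrying comparator pattern, using a plain insertion sort rather than the kernel's sort_r()/eytzinger0_sort_r() (everything below is illustrative):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* comparator with an opaque context pointer, mirroring bch2_memcmp() */
	static int memcmp_r(const void *l, const void *r, const void *priv)
	{
		size_t size = (size_t) priv;

		return memcmp(l, r, size);
	}

	/* minimal sort taking an (l, r, priv) comparator; stands in for sort_r() */
	static void sort_with_priv(void *base, size_t nr, size_t size,
				   int (*cmp)(const void *, const void *, const void *),
				   const void *priv)
	{
		char tmp[64];	/* assumes size <= 64 for this sketch */

		for (size_t i = 1; i < nr; i++)
			for (size_t j = i; j > 0 &&
			     cmp((char *) base + (j - 1) * size,
				 (char *) base + j * size, priv) > 0; j--) {
				memcpy(tmp, (char *) base + (j - 1) * size, size);
				memcpy((char *) base + (j - 1) * size,
				       (char *) base + j * size, size);
				memcpy((char *) base + j * size, tmp, size);
			}
	}

	int main(void)
	{
		uint8_t entries[3][4] = { { 9, 0, 0, 0 }, { 1, 2, 3, 4 }, { 1, 1, 1, 1 } };
		size_t entry_size = sizeof(entries[0]);

		/* the entry size rides along as the comparator's context, as in the hunk */
		sort_with_priv(entries, 3, entry_size, memcmp_r,
			       (void *)(size_t) entry_size);

		for (int i = 0; i < 3; i++)
			printf("%u %u %u %u\n", entries[i][0], entries[i][1],
			       entries[i][2], entries[i][3]);
		return 0;
	}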
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -80,7 +82,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
- if (!bch2_dev_exists(sb, r->devs[i])) {
+ if (!bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@@ -188,24 +190,17 @@ cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
struct bch_replicas_entry_v1 *new_entry)
{
- unsigned i;
struct bch_replicas_cpu new = {
.nr = old->nr + 1,
.entry_size = max_t(unsigned, old->entry_size,
replicas_entry_bytes(new_entry)),
};
- for (i = 0; i < new_entry->nr_devs; i++)
- BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
-
- BUG_ON(!new_entry->data_type);
- verify_replicas_entry(new_entry);
-
new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
- for (i = 0; i < old->nr; i++)
+ for (unsigned i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(&new, i),
cpu_replicas_entry(old, i),
old->entry_size);
@@ -226,9 +221,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size))
return -1;
- verify_replicas_entry(search);
-
-#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@@ -520,13 +513,16 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
c->replicas_gc.nr = 0;
c->replicas_gc.entry_size = 0;
- for_each_cpu_replicas_entry(&c->replicas, e)
- if (!((1 << e->data_type) & typemask)) {
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ /* Preserve unknown data types */
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask)) {
c->replicas_gc.nr++;
c->replicas_gc.entry_size =
max_t(unsigned, c->replicas_gc.entry_size,
replicas_entry_bytes(e));
}
+ }
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
@@ -538,7 +534,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
}
for_each_cpu_replicas_entry(&c->replicas, e)
- if (!((1 << e->data_type) & typemask))
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
e, c->replicas_gc.entry_size);
@@ -824,10 +821,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
@@ -855,7 +853,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
}
static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_replicas_cpu cpu_r;
@@ -894,7 +892,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
};
static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_cpu cpu_r;
@@ -942,18 +940,20 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
percpu_down_read(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ unsigned nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
if (e->data_type == BCH_DATA_cached)
continue;
- for (i = 0; i < e->nr_devs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
-
+ rcu_read_lock();
+ for (unsigned i = 0; i < e->nr_devs; i++) {
nr_online += test_bit(e->devs[i], devs.d);
- nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+
+ struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
+ nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
}
+ rcu_read_unlock();
if (nr_failed == e->nr_devs)
continue;
@@ -991,7 +991,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
struct bch_sb_field_replicas *replicas;
struct bch_sb_field_replicas_v0 *replicas_v0;
- unsigned i, data_has = 0;
+ unsigned data_has = 0;
replicas = bch2_sb_field_get(sb, replicas);
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
@@ -999,17 +999,26 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
if (replicas) {
struct bch_replicas_entry_v1 *r;
- for_each_replicas_entry(replicas, r)
- for (i = 0; i < r->nr_devs; i++)
+ for_each_replicas_entry(replicas, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
if (r->devs[i] == dev)
data_has |= 1 << r->data_type;
+ }
+
} else if (replicas_v0) {
struct bch_replicas_entry_v0 *r;
- for_each_replicas_entry_v0(replicas_v0, r)
- for (i = 0; i < r->nr_devs; i++)
+ for_each_replicas_entry_v0(replicas_v0, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
if (r->devs[i] == dev)
data_has |= 1 << r->data_type;
+ }
}
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
new file mode 100644
index 000000000000..b97208195d06
--- /dev/null
+++ b/fs/bcachefs/replicas_format.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_FORMAT_H
+#define _BCACHEFS_REPLICAS_FORMAT_H
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry_v1 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v1 entries[];
+} __packed __aligned(8);
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
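+
+The new replicas_format.h keeps the on-disk layouts together; replicas_entry_bytes() sizes a variable-length entry as the header bytes up to devs[] plus one byte per device. A standalone check of that arithmetic, mirroring the macro above (struct name and main() are for illustration only):

	#include <assert.h>
	#include <stddef.h>

	struct replicas_entry {
		unsigned char	data_type;
		unsigned char	nr_devs;
		unsigned char	nr_required;
		unsigned char	devs[];		/* flexible array, nr_devs entries */
	};

	#define replicas_entry_bytes(_i)	\
		(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)

	int main(void)
	{
		struct replicas_entry e = {
			.data_type	= 1,
			.nr_devs	= 3,
			.nr_required	= 1,
		};

		/* 3 header bytes before devs[], plus one byte per device = 6 */
		assert(replicas_entry_bytes(&e) == 6);
		return 0;
	}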
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 5980ba2563fe..47f10ab57f40 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -29,6 +29,14 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if (vstruct_end(entry) > vstruct_end(&clean->field)) {
+ bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
+ le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
+ (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
+ bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
@@ -258,9 +266,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
}
-static int bch2_sb_clean_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
@@ -270,6 +277,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb,
return -BCH_ERR_invalid_sb_clean;
}
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
+ prt_str(err, "entry type ");
+ bch2_prt_jset_entry_type(err, entry->type);
+ prt_str(err, " overruns end of section");
+ return -BCH_ERR_invalid_sb_clean;
+ }
+ }
+
return 0;
}
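
Both loops added to sb-clean.c refuse to trust a journal entry whose self-reported length would carry the walk past the end of the clean section, reporting or skipping it before anything reads out of bounds. A plain-C sketch of that bounds-checked walk over variable-length records (struct rec and its field sizes are illustrative; the real code uses the vstruct helpers shown above):

	#include <stdbool.h>
	#include <stdint.h>

	/* variable-length record: u64s counts 64-bit words of payload that follow */
	struct rec {
		uint16_t	u64s;
		uint16_t	type;
		uint32_t	pad;
		uint64_t	data[];
	};

	static const struct rec *rec_next(const struct rec *r)
	{
		return (const void *) (r->data + r->u64s);
	}

	/*
	 * Walk records in [start, end), refusing to trust any record whose claimed
	 * length would carry the walk past the end of the section.
	 */
	static bool validate_section(const void *start, const void *end)
	{
		for (const struct rec *r = start; (const void *) r < end; r = rec_next(r)) {
			if ((const void *) (r + 1) > end ||
			    (const void *) rec_next(r) > end)
				return false;	/* entry overruns the section */
			/* ... per-record validation would go here ... */
		}
		return true;
	}

	int main(void)
	{
		uint64_t buf[4] = { 0 };
		struct rec *r = (struct rec *) buf;

		r->u64s = 8;	/* claims 8 payload words, but only 3 fit */
		return validate_section(buf, buf + 4) ? 1 : 0;
	}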
@@ -279,14 +297,15 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_clean *clean = field_to_type(f, clean);
struct jset_entry *entry;
- prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
- prt_newline(out);
- prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
- prt_newline(out);
+ prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
+ prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
+ break;
+
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
@@ -370,6 +389,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
goto out;
}
+ bch2_journal_pos_from_member_info_set(c);
+
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 7dc898761bb3..6992e7469112 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
};
-static int bch2_sb_counters_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
};
@@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- for (i = 0; i < nr; i++) {
- if (i < BCH_COUNTER_NR)
- prt_printf(out, "%s ", bch2_counter_names[i]);
- else
- prt_printf(out, "(unknown)");
-
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < nr; i++)
+ prt_printf(out, "%s \t%llu\n",
+ i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
+ le64_to_cpu(ctrs->d[i]));
};
int bch2_sb_counters_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index e4396cb0bacb..3fb23e399ffb 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
#include "bcachefs.h"
#include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
@@ -51,7 +51,10 @@
BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
x(btree_subvolume_children, \
BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
- BCH_FSCK_ERR_subvol_children_not_set)
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked)
#define DOWNGRADE_TABLE()
@@ -131,15 +134,32 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
#define for_each_downgrade_entry(_d, _i) \
for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
(void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \
+ (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
+ (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
_i = downgrade_entry_next_c(_i))
static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
- for_each_downgrade_entry(e, i) {
+ for (const struct bch_sb_field_downgrade_entry *i = e->entries;
+ (void *) i < vstruct_end(&e->field);
+ i = downgrade_entry_next_c(i)) {
+ /*
+ * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
+ * section sizes are 8 byte aligned - an empty entry spanning
+ * the end of the section is allowed (and ignored):
+ */
+ if ((void *) &i->errors[0] > vstruct_end(&e->field))
+ break;
+
+ if (flags & BCH_VALIDATE_write &&
+ (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
+ prt_printf(err, "downgrade entry overruns end of superblock section");
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+
if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
@@ -161,19 +181,16 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
printbuf_tabstop_push(out, 16);
for_each_downgrade_entry(e, i) {
- prt_str(out, "version:");
- prt_tab(out);
+ prt_str(out, "version:\t");
bch2_version_to_text(out, le16_to_cpu(i->version));
prt_newline(out);
- prt_str(out, "recovery passes:");
- prt_tab(out);
+ prt_str(out, "recovery passes:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
prt_newline(out);
- prt_str(out, "errors:");
- prt_tab(out);
+ prt_str(out, "errors:\t");
bool first = true;
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
if (!first)
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
new file mode 100644
index 000000000000..cffd932be3ec
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade_format.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 5f5bcae391fb..bda33e59e226 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
}
static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
new file mode 100644
index 000000000000..84d2763bd597
--- /dev/null
+++ b/fs/bcachefs/sb-errors_format.h
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
+#define _BCACHEFS_SB_ERRORS_FORMAT_H
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0) \
+ x(dirty_but_no_journal_entries, 1) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
+ x(sb_clean_journal_seq_mismatch, 3) \
+ x(sb_clean_btree_root_mismatch, 4) \
+ x(sb_clean_missing, 5) \
+ x(jset_unsupported_version, 6) \
+ x(jset_unknown_csum, 7) \
+ x(jset_last_seq_newer_than_seq, 8) \
+ x(jset_past_bucket_end, 9) \
+ x(jset_seq_blacklisted, 10) \
+ x(journal_entries_missing, 11) \
+ x(journal_entry_replicas_not_marked, 12) \
+ x(journal_entry_past_jset_end, 13) \
+ x(journal_entry_replicas_data_mismatch, 14) \
+ x(journal_entry_bkey_u64s_0, 15) \
+ x(journal_entry_bkey_past_end, 16) \
+ x(journal_entry_bkey_bad_format, 17) \
+ x(journal_entry_bkey_invalid, 18) \
+ x(journal_entry_btree_root_bad_size, 19) \
+ x(journal_entry_blacklist_bad_size, 20) \
+ x(journal_entry_blacklist_v2_bad_size, 21) \
+ x(journal_entry_blacklist_v2_start_past_end, 22) \
+ x(journal_entry_usage_bad_size, 23) \
+ x(journal_entry_data_usage_bad_size, 24) \
+ x(journal_entry_clock_bad_size, 25) \
+ x(journal_entry_clock_bad_rw, 26) \
+ x(journal_entry_dev_usage_bad_size, 27) \
+ x(journal_entry_dev_usage_bad_dev, 28) \
+ x(journal_entry_dev_usage_bad_pad, 29) \
+ x(btree_node_unreadable, 30) \
+ x(btree_node_fault_injected, 31) \
+ x(btree_node_bad_magic, 32) \
+ x(btree_node_bad_seq, 33) \
+ x(btree_node_unsupported_version, 34) \
+ x(btree_node_bset_older_than_sb_min, 35) \
+ x(btree_node_bset_newer_than_sb, 36) \
+ x(btree_node_data_missing, 37) \
+ x(btree_node_bset_after_end, 38) \
+ x(btree_node_replicas_sectors_written_mismatch, 39) \
+ x(btree_node_replicas_data_mismatch, 40) \
+ x(bset_unknown_csum, 41) \
+ x(bset_bad_csum, 42) \
+ x(bset_past_end_of_btree_node, 43) \
+ x(bset_wrong_sector_offset, 44) \
+ x(bset_empty, 45) \
+ x(bset_bad_seq, 46) \
+ x(bset_blacklisted_journal_seq, 47) \
+ x(first_bset_blacklisted_journal_seq, 48) \
+ x(btree_node_bad_btree, 49) \
+ x(btree_node_bad_level, 50) \
+ x(btree_node_bad_min_key, 51) \
+ x(btree_node_bad_max_key, 52) \
+ x(btree_node_bad_format, 53) \
+ x(btree_node_bkey_past_bset_end, 54) \
+ x(btree_node_bkey_bad_format, 55) \
+ x(btree_node_bad_bkey, 56) \
+ x(btree_node_bkey_out_of_order, 57) \
+ x(btree_root_bkey_invalid, 58) \
+ x(btree_root_read_error, 59) \
+ x(btree_root_bad_min_key, 60) \
+ x(btree_root_bad_max_key, 61) \
+ x(btree_node_read_error, 62) \
+ x(btree_node_topology_bad_min_key, 63) \
+ x(btree_node_topology_bad_max_key, 64) \
+ x(btree_node_topology_overwritten_by_prev_node, 65) \
+ x(btree_node_topology_overwritten_by_next_node, 66) \
+ x(btree_node_topology_interior_node_empty, 67) \
+ x(fs_usage_hidden_wrong, 68) \
+ x(fs_usage_btree_wrong, 69) \
+ x(fs_usage_data_wrong, 70) \
+ x(fs_usage_cached_wrong, 71) \
+ x(fs_usage_reserved_wrong, 72) \
+ x(fs_usage_persistent_reserved_wrong, 73) \
+ x(fs_usage_nr_inodes_wrong, 74) \
+ x(fs_usage_replicas_wrong, 75) \
+ x(dev_usage_buckets_wrong, 76) \
+ x(dev_usage_sectors_wrong, 77) \
+ x(dev_usage_fragmented_wrong, 78) \
+ x(dev_usage_buckets_ec_wrong, 79) \
+ x(bkey_version_in_future, 80) \
+ x(bkey_u64s_too_small, 81) \
+ x(bkey_invalid_type_for_btree, 82) \
+ x(bkey_extent_size_zero, 83) \
+ x(bkey_extent_size_greater_than_offset, 84) \
+ x(bkey_size_nonzero, 85) \
+ x(bkey_snapshot_nonzero, 86) \
+ x(bkey_snapshot_zero, 87) \
+ x(bkey_at_pos_max, 88) \
+ x(bkey_before_start_of_btree_node, 89) \
+ x(bkey_after_end_of_btree_node, 90) \
+ x(bkey_val_size_nonzero, 91) \
+ x(bkey_val_size_too_small, 92) \
+ x(alloc_v1_val_size_bad, 93) \
+ x(alloc_v2_unpack_error, 94) \
+ x(alloc_v3_unpack_error, 95) \
+ x(alloc_v4_val_size_bad, 96) \
+ x(alloc_v4_backpointers_start_bad, 97) \
+ x(alloc_key_data_type_bad, 98) \
+ x(alloc_key_empty_but_have_data, 99) \
+ x(alloc_key_dirty_sectors_0, 100) \
+ x(alloc_key_data_type_inconsistency, 101) \
+ x(alloc_key_to_missing_dev_bucket, 102) \
+ x(alloc_key_cached_inconsistency, 103) \
+ x(alloc_key_cached_but_read_time_zero, 104) \
+ x(alloc_key_to_missing_lru_entry, 105) \
+ x(alloc_key_data_type_wrong, 106) \
+ x(alloc_key_gen_wrong, 107) \
+ x(alloc_key_dirty_sectors_wrong, 108) \
+ x(alloc_key_cached_sectors_wrong, 109) \
+ x(alloc_key_stripe_wrong, 110) \
+ x(alloc_key_stripe_redundancy_wrong, 111) \
+ x(bucket_sector_count_overflow, 112) \
+ x(bucket_metadata_type_mismatch, 113) \
+ x(need_discard_key_wrong, 114) \
+ x(freespace_key_wrong, 115) \
+ x(freespace_hole_missing, 116) \
+ x(bucket_gens_val_size_bad, 117) \
+ x(bucket_gens_key_wrong, 118) \
+ x(bucket_gens_hole_wrong, 119) \
+ x(bucket_gens_to_invalid_dev, 120) \
+ x(bucket_gens_to_invalid_buckets, 121) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
+ x(need_discard_freespace_key_bad, 124) \
+ x(backpointer_bucket_offset_wrong, 125) \
+ x(backpointer_to_missing_device, 126) \
+ x(backpointer_to_missing_alloc, 127) \
+ x(backpointer_to_missing_ptr, 128) \
+ x(lru_entry_at_time_0, 129) \
+ x(lru_entry_to_invalid_bucket, 130) \
+ x(lru_entry_bad, 131) \
+ x(btree_ptr_val_too_big, 132) \
+ x(btree_ptr_v2_val_too_big, 133) \
+ x(btree_ptr_has_non_ptr, 134) \
+ x(extent_ptrs_invalid_entry, 135) \
+ x(extent_ptrs_no_ptrs, 136) \
+ x(extent_ptrs_too_many_ptrs, 137) \
+ x(extent_ptrs_redundant_crc, 138) \
+ x(extent_ptrs_redundant_stripe, 139) \
+ x(extent_ptrs_unwritten, 140) \
+ x(extent_ptrs_written_and_unwritten, 141) \
+ x(ptr_to_invalid_device, 142) \
+ x(ptr_to_duplicate_device, 143) \
+ x(ptr_after_last_bucket, 144) \
+ x(ptr_before_first_bucket, 145) \
+ x(ptr_spans_multiple_buckets, 146) \
+ x(ptr_to_missing_backpointer, 147) \
+ x(ptr_to_missing_alloc_key, 148) \
+ x(ptr_to_missing_replicas_entry, 149) \
+ x(ptr_to_missing_stripe, 150) \
+ x(ptr_to_incorrect_stripe, 151) \
+ x(ptr_gen_newer_than_bucket_gen, 152) \
+ x(ptr_too_stale, 153) \
+ x(stale_dirty_ptr, 154) \
+ x(ptr_bucket_data_type_mismatch, 155) \
+ x(ptr_cached_and_erasure_coded, 156) \
+ x(ptr_crc_uncompressed_size_too_small, 157) \
+ x(ptr_crc_csum_type_unknown, 158) \
+ x(ptr_crc_compression_type_unknown, 159) \
+ x(ptr_crc_redundant, 160) \
+ x(ptr_crc_uncompressed_size_too_big, 161) \
+ x(ptr_crc_nonce_mismatch, 162) \
+ x(ptr_stripe_redundant, 163) \
+ x(reservation_key_nr_replicas_invalid, 164) \
+ x(reflink_v_refcount_wrong, 165) \
+ x(reflink_p_to_missing_reflink_v, 166) \
+ x(stripe_pos_bad, 167) \
+ x(stripe_val_size_bad, 168) \
+ x(stripe_sector_count_wrong, 169) \
+ x(snapshot_tree_pos_bad, 170) \
+ x(snapshot_tree_to_missing_snapshot, 171) \
+ x(snapshot_tree_to_missing_subvol, 172) \
+ x(snapshot_tree_to_wrong_subvol, 173) \
+ x(snapshot_tree_to_snapshot_subvol, 174) \
+ x(snapshot_pos_bad, 175) \
+ x(snapshot_parent_bad, 176) \
+ x(snapshot_children_not_normalized, 177) \
+ x(snapshot_child_duplicate, 178) \
+ x(snapshot_child_bad, 179) \
+ x(snapshot_skiplist_not_normalized, 180) \
+ x(snapshot_skiplist_bad, 181) \
+ x(snapshot_should_not_have_subvol, 182) \
+ x(snapshot_to_bad_snapshot_tree, 183) \
+ x(snapshot_bad_depth, 184) \
+ x(snapshot_bad_skiplist, 185) \
+ x(subvol_pos_bad, 186) \
+ x(subvol_not_master_and_not_snapshot, 187) \
+ x(subvol_to_missing_root, 188) \
+ x(subvol_root_wrong_bi_subvol, 189) \
+ x(bkey_in_missing_snapshot, 190) \
+ x(inode_pos_inode_nonzero, 191) \
+ x(inode_pos_blockdev_range, 192) \
+ x(inode_unpack_error, 193) \
+ x(inode_str_hash_invalid, 194) \
+ x(inode_v3_fields_start_bad, 195) \
+ x(inode_snapshot_mismatch, 196) \
+ x(inode_unlinked_but_clean, 197) \
+ x(inode_unlinked_but_nlink_nonzero, 198) \
+ x(inode_checksum_type_invalid, 199) \
+ x(inode_compression_type_invalid, 200) \
+ x(inode_subvol_root_but_not_dir, 201) \
+ x(inode_i_size_dirty_but_clean, 202) \
+ x(inode_i_sectors_dirty_but_clean, 203) \
+ x(inode_i_sectors_wrong, 204) \
+ x(inode_dir_wrong_nlink, 205) \
+ x(inode_dir_multiple_links, 206) \
+ x(inode_multiple_links_but_nlink_0, 207) \
+ x(inode_wrong_backpointer, 208) \
+ x(inode_wrong_nlink, 209) \
+ x(inode_unreachable, 210) \
+ x(deleted_inode_but_clean, 211) \
+ x(deleted_inode_missing, 212) \
+ x(deleted_inode_is_dir, 213) \
+ x(deleted_inode_not_unlinked, 214) \
+ x(extent_overlapping, 215) \
+ x(extent_in_missing_inode, 216) \
+ x(extent_in_non_reg_inode, 217) \
+ x(extent_past_end_of_inode, 218) \
+ x(dirent_empty_name, 219) \
+ x(dirent_val_too_big, 220) \
+ x(dirent_name_too_long, 221) \
+ x(dirent_name_embedded_nul, 222) \
+ x(dirent_name_dot_or_dotdot, 223) \
+ x(dirent_name_has_slash, 224) \
+ x(dirent_d_type_wrong, 225) \
+ x(inode_bi_parent_wrong, 226) \
+ x(dirent_in_missing_dir_inode, 227) \
+ x(dirent_in_non_dir_inode, 228) \
+ x(dirent_to_missing_inode, 229) \
+ x(dirent_to_missing_subvol, 230) \
+ x(dirent_to_itself, 231) \
+ x(quota_type_invalid, 232) \
+ x(xattr_val_size_too_small, 233) \
+ x(xattr_val_size_too_big, 234) \
+ x(xattr_invalid_type, 235) \
+ x(xattr_name_invalid_chars, 236) \
+ x(xattr_in_missing_inode, 237) \
+ x(root_subvol_missing, 238) \
+ x(root_dir_missing, 239) \
+ x(root_inode_not_dir, 240) \
+ x(dir_loop, 241) \
+ x(hash_table_key_duplicate, 242) \
+ x(hash_table_key_wrong_offset, 243) \
+ x(unlinked_inode_not_on_deleted_list, 244) \
+ x(reflink_p_front_pad_bad, 245) \
+ x(journal_entry_dup_same_device, 246) \
+ x(inode_bi_subvol_missing, 247) \
+ x(inode_bi_subvol_wrong, 248) \
+ x(inode_points_to_missing_dirent, 249) \
+ x(inode_points_to_wrong_dirent, 250) \
+ x(inode_bi_parent_nonzero, 251) \
+ x(dirent_to_missing_parent_subvol, 252) \
+ x(dirent_not_visible_in_parent_subvol, 253) \
+ x(subvol_fs_path_parent_wrong, 254) \
+ x(subvol_root_fs_path_parent_nonzero, 255) \
+ x(subvol_children_not_set, 256) \
+ x(subvol_children_bad, 257) \
+ x(subvol_loop, 258) \
+ x(subvol_unreachable, 259) \
+ x(btree_node_bkey_bad_u64s, 260) \
+ x(btree_node_topology_empty_interior_node, 261) \
+ x(btree_ptr_v2_min_key_bad, 262) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263) \
+ x(snapshot_node_missing, 264) \
+ x(dup_backpointer_to_bad_csum_extent, 265) \
+ x(btree_bitmap_not_marked, 266) \
+ x(sb_clean_entry_overrun, 267) \
+ x(btree_ptr_v2_written_0, 268) \
+ x(subvol_snapshot_bad, 269) \
+ x(subvol_inode_bad, 270)
+
+enum bch_sb_error_id {
+#define x(t, n) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+ BCH_SB_ERR_MAX
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
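The new format header keeps the fsck error table as an x-macro, so each name stays tied to its on-disk number, and the superblock's per-error counters pack the error ID together with an occurrence count into one little-endian u64, split 16/48 bits by the two LE64_BITMASK declarations above. A standalone sketch of that packing, using plain shifts and masks instead of the kernel's LE64_BITMASK helpers (illustration only, not part of the patch):

/* Illustration of the BCH_SB_ERROR_ENTRY_ID/NR layout (id: bits 0-15,
 * count: bits 16-63); plain C, not the kernel's generated accessors. */
#include <stdint.h>

static inline uint64_t sb_error_entry_pack(unsigned id, uint64_t nr)
{
	return ((uint64_t) id & 0xffff) |
	       ((nr & ((1ULL << 48) - 1)) << 16);
}

static inline unsigned sb_error_entry_id(uint64_t v)
{
	return v & 0xffff;
}

static inline uint64_t sb_error_entry_nr(uint64_t v)
{
	return v >> 16;
}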
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 5178bf579f7c..40325239c3b0 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -4,276 +4,6 @@
#include "darray.h"
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0) \
- x(dirty_but_no_journal_entries, 1) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
- x(sb_clean_journal_seq_mismatch, 3) \
- x(sb_clean_btree_root_mismatch, 4) \
- x(sb_clean_missing, 5) \
- x(jset_unsupported_version, 6) \
- x(jset_unknown_csum, 7) \
- x(jset_last_seq_newer_than_seq, 8) \
- x(jset_past_bucket_end, 9) \
- x(jset_seq_blacklisted, 10) \
- x(journal_entries_missing, 11) \
- x(journal_entry_replicas_not_marked, 12) \
- x(journal_entry_past_jset_end, 13) \
- x(journal_entry_replicas_data_mismatch, 14) \
- x(journal_entry_bkey_u64s_0, 15) \
- x(journal_entry_bkey_past_end, 16) \
- x(journal_entry_bkey_bad_format, 17) \
- x(journal_entry_bkey_invalid, 18) \
- x(journal_entry_btree_root_bad_size, 19) \
- x(journal_entry_blacklist_bad_size, 20) \
- x(journal_entry_blacklist_v2_bad_size, 21) \
- x(journal_entry_blacklist_v2_start_past_end, 22) \
- x(journal_entry_usage_bad_size, 23) \
- x(journal_entry_data_usage_bad_size, 24) \
- x(journal_entry_clock_bad_size, 25) \
- x(journal_entry_clock_bad_rw, 26) \
- x(journal_entry_dev_usage_bad_size, 27) \
- x(journal_entry_dev_usage_bad_dev, 28) \
- x(journal_entry_dev_usage_bad_pad, 29) \
- x(btree_node_unreadable, 30) \
- x(btree_node_fault_injected, 31) \
- x(btree_node_bad_magic, 32) \
- x(btree_node_bad_seq, 33) \
- x(btree_node_unsupported_version, 34) \
- x(btree_node_bset_older_than_sb_min, 35) \
- x(btree_node_bset_newer_than_sb, 36) \
- x(btree_node_data_missing, 37) \
- x(btree_node_bset_after_end, 38) \
- x(btree_node_replicas_sectors_written_mismatch, 39) \
- x(btree_node_replicas_data_mismatch, 40) \
- x(bset_unknown_csum, 41) \
- x(bset_bad_csum, 42) \
- x(bset_past_end_of_btree_node, 43) \
- x(bset_wrong_sector_offset, 44) \
- x(bset_empty, 45) \
- x(bset_bad_seq, 46) \
- x(bset_blacklisted_journal_seq, 47) \
- x(first_bset_blacklisted_journal_seq, 48) \
- x(btree_node_bad_btree, 49) \
- x(btree_node_bad_level, 50) \
- x(btree_node_bad_min_key, 51) \
- x(btree_node_bad_max_key, 52) \
- x(btree_node_bad_format, 53) \
- x(btree_node_bkey_past_bset_end, 54) \
- x(btree_node_bkey_bad_format, 55) \
- x(btree_node_bad_bkey, 56) \
- x(btree_node_bkey_out_of_order, 57) \
- x(btree_root_bkey_invalid, 58) \
- x(btree_root_read_error, 59) \
- x(btree_root_bad_min_key, 60) \
- x(btree_root_bad_max_key, 61) \
- x(btree_node_read_error, 62) \
- x(btree_node_topology_bad_min_key, 63) \
- x(btree_node_topology_bad_max_key, 64) \
- x(btree_node_topology_overwritten_by_prev_node, 65) \
- x(btree_node_topology_overwritten_by_next_node, 66) \
- x(btree_node_topology_interior_node_empty, 67) \
- x(fs_usage_hidden_wrong, 68) \
- x(fs_usage_btree_wrong, 69) \
- x(fs_usage_data_wrong, 70) \
- x(fs_usage_cached_wrong, 71) \
- x(fs_usage_reserved_wrong, 72) \
- x(fs_usage_persistent_reserved_wrong, 73) \
- x(fs_usage_nr_inodes_wrong, 74) \
- x(fs_usage_replicas_wrong, 75) \
- x(dev_usage_buckets_wrong, 76) \
- x(dev_usage_sectors_wrong, 77) \
- x(dev_usage_fragmented_wrong, 78) \
- x(dev_usage_buckets_ec_wrong, 79) \
- x(bkey_version_in_future, 80) \
- x(bkey_u64s_too_small, 81) \
- x(bkey_invalid_type_for_btree, 82) \
- x(bkey_extent_size_zero, 83) \
- x(bkey_extent_size_greater_than_offset, 84) \
- x(bkey_size_nonzero, 85) \
- x(bkey_snapshot_nonzero, 86) \
- x(bkey_snapshot_zero, 87) \
- x(bkey_at_pos_max, 88) \
- x(bkey_before_start_of_btree_node, 89) \
- x(bkey_after_end_of_btree_node, 90) \
- x(bkey_val_size_nonzero, 91) \
- x(bkey_val_size_too_small, 92) \
- x(alloc_v1_val_size_bad, 93) \
- x(alloc_v2_unpack_error, 94) \
- x(alloc_v3_unpack_error, 95) \
- x(alloc_v4_val_size_bad, 96) \
- x(alloc_v4_backpointers_start_bad, 97) \
- x(alloc_key_data_type_bad, 98) \
- x(alloc_key_empty_but_have_data, 99) \
- x(alloc_key_dirty_sectors_0, 100) \
- x(alloc_key_data_type_inconsistency, 101) \
- x(alloc_key_to_missing_dev_bucket, 102) \
- x(alloc_key_cached_inconsistency, 103) \
- x(alloc_key_cached_but_read_time_zero, 104) \
- x(alloc_key_to_missing_lru_entry, 105) \
- x(alloc_key_data_type_wrong, 106) \
- x(alloc_key_gen_wrong, 107) \
- x(alloc_key_dirty_sectors_wrong, 108) \
- x(alloc_key_cached_sectors_wrong, 109) \
- x(alloc_key_stripe_wrong, 110) \
- x(alloc_key_stripe_redundancy_wrong, 111) \
- x(bucket_sector_count_overflow, 112) \
- x(bucket_metadata_type_mismatch, 113) \
- x(need_discard_key_wrong, 114) \
- x(freespace_key_wrong, 115) \
- x(freespace_hole_missing, 116) \
- x(bucket_gens_val_size_bad, 117) \
- x(bucket_gens_key_wrong, 118) \
- x(bucket_gens_hole_wrong, 119) \
- x(bucket_gens_to_invalid_dev, 120) \
- x(bucket_gens_to_invalid_buckets, 121) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
- x(need_discard_freespace_key_bad, 124) \
- x(backpointer_pos_wrong, 125) \
- x(backpointer_to_missing_device, 126) \
- x(backpointer_to_missing_alloc, 127) \
- x(backpointer_to_missing_ptr, 128) \
- x(lru_entry_at_time_0, 129) \
- x(lru_entry_to_invalid_bucket, 130) \
- x(lru_entry_bad, 131) \
- x(btree_ptr_val_too_big, 132) \
- x(btree_ptr_v2_val_too_big, 133) \
- x(btree_ptr_has_non_ptr, 134) \
- x(extent_ptrs_invalid_entry, 135) \
- x(extent_ptrs_no_ptrs, 136) \
- x(extent_ptrs_too_many_ptrs, 137) \
- x(extent_ptrs_redundant_crc, 138) \
- x(extent_ptrs_redundant_stripe, 139) \
- x(extent_ptrs_unwritten, 140) \
- x(extent_ptrs_written_and_unwritten, 141) \
- x(ptr_to_invalid_device, 142) \
- x(ptr_to_duplicate_device, 143) \
- x(ptr_after_last_bucket, 144) \
- x(ptr_before_first_bucket, 145) \
- x(ptr_spans_multiple_buckets, 146) \
- x(ptr_to_missing_backpointer, 147) \
- x(ptr_to_missing_alloc_key, 148) \
- x(ptr_to_missing_replicas_entry, 149) \
- x(ptr_to_missing_stripe, 150) \
- x(ptr_to_incorrect_stripe, 151) \
- x(ptr_gen_newer_than_bucket_gen, 152) \
- x(ptr_too_stale, 153) \
- x(stale_dirty_ptr, 154) \
- x(ptr_bucket_data_type_mismatch, 155) \
- x(ptr_cached_and_erasure_coded, 156) \
- x(ptr_crc_uncompressed_size_too_small, 157) \
- x(ptr_crc_csum_type_unknown, 158) \
- x(ptr_crc_compression_type_unknown, 159) \
- x(ptr_crc_redundant, 160) \
- x(ptr_crc_uncompressed_size_too_big, 161) \
- x(ptr_crc_nonce_mismatch, 162) \
- x(ptr_stripe_redundant, 163) \
- x(reservation_key_nr_replicas_invalid, 164) \
- x(reflink_v_refcount_wrong, 165) \
- x(reflink_p_to_missing_reflink_v, 166) \
- x(stripe_pos_bad, 167) \
- x(stripe_val_size_bad, 168) \
- x(stripe_sector_count_wrong, 169) \
- x(snapshot_tree_pos_bad, 170) \
- x(snapshot_tree_to_missing_snapshot, 171) \
- x(snapshot_tree_to_missing_subvol, 172) \
- x(snapshot_tree_to_wrong_subvol, 173) \
- x(snapshot_tree_to_snapshot_subvol, 174) \
- x(snapshot_pos_bad, 175) \
- x(snapshot_parent_bad, 176) \
- x(snapshot_children_not_normalized, 177) \
- x(snapshot_child_duplicate, 178) \
- x(snapshot_child_bad, 179) \
- x(snapshot_skiplist_not_normalized, 180) \
- x(snapshot_skiplist_bad, 181) \
- x(snapshot_should_not_have_subvol, 182) \
- x(snapshot_to_bad_snapshot_tree, 183) \
- x(snapshot_bad_depth, 184) \
- x(snapshot_bad_skiplist, 185) \
- x(subvol_pos_bad, 186) \
- x(subvol_not_master_and_not_snapshot, 187) \
- x(subvol_to_missing_root, 188) \
- x(subvol_root_wrong_bi_subvol, 189) \
- x(bkey_in_missing_snapshot, 190) \
- x(inode_pos_inode_nonzero, 191) \
- x(inode_pos_blockdev_range, 192) \
- x(inode_unpack_error, 193) \
- x(inode_str_hash_invalid, 194) \
- x(inode_v3_fields_start_bad, 195) \
- x(inode_snapshot_mismatch, 196) \
- x(inode_unlinked_but_clean, 197) \
- x(inode_unlinked_but_nlink_nonzero, 198) \
- x(inode_checksum_type_invalid, 199) \
- x(inode_compression_type_invalid, 200) \
- x(inode_subvol_root_but_not_dir, 201) \
- x(inode_i_size_dirty_but_clean, 202) \
- x(inode_i_sectors_dirty_but_clean, 203) \
- x(inode_i_sectors_wrong, 204) \
- x(inode_dir_wrong_nlink, 205) \
- x(inode_dir_multiple_links, 206) \
- x(inode_multiple_links_but_nlink_0, 207) \
- x(inode_wrong_backpointer, 208) \
- x(inode_wrong_nlink, 209) \
- x(inode_unreachable, 210) \
- x(deleted_inode_but_clean, 211) \
- x(deleted_inode_missing, 212) \
- x(deleted_inode_is_dir, 213) \
- x(deleted_inode_not_unlinked, 214) \
- x(extent_overlapping, 215) \
- x(extent_in_missing_inode, 216) \
- x(extent_in_non_reg_inode, 217) \
- x(extent_past_end_of_inode, 218) \
- x(dirent_empty_name, 219) \
- x(dirent_val_too_big, 220) \
- x(dirent_name_too_long, 221) \
- x(dirent_name_embedded_nul, 222) \
- x(dirent_name_dot_or_dotdot, 223) \
- x(dirent_name_has_slash, 224) \
- x(dirent_d_type_wrong, 225) \
- x(inode_bi_parent_wrong, 226) \
- x(dirent_in_missing_dir_inode, 227) \
- x(dirent_in_non_dir_inode, 228) \
- x(dirent_to_missing_inode, 229) \
- x(dirent_to_missing_subvol, 230) \
- x(dirent_to_itself, 231) \
- x(quota_type_invalid, 232) \
- x(xattr_val_size_too_small, 233) \
- x(xattr_val_size_too_big, 234) \
- x(xattr_invalid_type, 235) \
- x(xattr_name_invalid_chars, 236) \
- x(xattr_in_missing_inode, 237) \
- x(root_subvol_missing, 238) \
- x(root_dir_missing, 239) \
- x(root_inode_not_dir, 240) \
- x(dir_loop, 241) \
- x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243) \
- x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245) \
- x(journal_entry_dup_same_device, 246) \
- x(inode_bi_subvol_missing, 247) \
- x(inode_bi_subvol_wrong, 248) \
- x(inode_points_to_missing_dirent, 249) \
- x(inode_points_to_wrong_dirent, 250) \
- x(inode_bi_parent_nonzero, 251) \
- x(dirent_to_missing_parent_subvol, 252) \
- x(dirent_not_visible_in_parent_subvol, 253) \
- x(subvol_fs_path_parent_wrong, 254) \
- x(subvol_root_fs_path_parent_nonzero, 255) \
- x(subvol_children_not_set, 256) \
- x(subvol_children_bad, 257) \
- x(subvol_loop, 258) \
- x(subvol_unreachable, 259) \
- x(btree_node_bkey_bad_u64s, 260)
-
-enum bch_sb_error_id {
-#define x(t, n) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
- BCH_SB_ERR_MAX
-};
-
struct bch_sb_error_entry_cpu {
u64 id:16,
nr:48;
@@ -283,4 +13,3 @@ struct bch_sb_error_entry_cpu {
typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
-
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..39196f2a4197 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,12 +1,24 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
+#include "error.h"
#include "opts.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
+void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
+}
+
#define x(t, n, ...) [n] = #t,
static const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
@@ -123,9 +135,9 @@ static int validate_member(struct printbuf *err,
struct bch_sb *sb,
int i)
{
- if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
return -BCH_ERR_invalid_sb_members;
}
@@ -163,18 +175,14 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
return;
- prt_printf(out, "Device:");
- prt_tab(out);
- prt_printf(out, "%u", i);
- prt_newline(out);
+ prt_printf(out, "Device:\t%u\n", i);
printbuf_indent_add(out, 2);
- prt_printf(out, "Label:");
- prt_tab(out);
+ prt_printf(out, "Label:\t");
if (BCH_MEMBER_GROUP(&m)) {
unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
@@ -188,103 +196,73 @@ static void member_to_text(struct printbuf *out,
}
prt_newline(out);
- prt_printf(out, "UUID:");
- prt_tab(out);
+ prt_printf(out, "UUID:\t");
pr_uuid(out, m.uuid.b);
prt_newline(out);
- prt_printf(out, "Size:");
- prt_tab(out);
+ prt_printf(out, "Size:\t");
prt_units_u64(out, device_size << 9);
prt_newline(out);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
- for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
- prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
- prt_tab(out);
- prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++)
+ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
- prt_printf(out, "Bucket size:");
- prt_tab(out);
+ prt_printf(out, "Bucket size:\t");
prt_units_u64(out, bucket_size << 9);
prt_newline(out);
- prt_printf(out, "First bucket:");
- prt_tab(out);
- prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
- prt_newline(out);
-
- prt_printf(out, "Buckets:");
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
- prt_newline(out);
+ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
+ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
- prt_printf(out, "Last mount:");
- prt_tab(out);
+ prt_printf(out, "Last mount:\t");
if (m.last_mount)
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
- prt_printf(out, "Last superblock write:");
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.seq));
- prt_newline(out);
+ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
- prt_printf(out, "State:");
- prt_tab(out);
- prt_printf(out, "%s",
+ prt_printf(out, "State:\t%s\n",
BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
? bch2_member_states[BCH_MEMBER_STATE(&m)]
: "unknown");
- prt_newline(out);
- prt_printf(out, "Data allowed:");
- prt_tab(out);
+ prt_printf(out, "Data allowed:\t");
if (BCH_MEMBER_DATA_ALLOWED(&m))
prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_printf(out, "Has data:");
- prt_tab(out);
+ prt_printf(out, "Has data:\t");
if (data_have)
prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_str(out, "Durability:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_printf(out, "Btree allocated bitmap blocksize:\t");
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
prt_newline(out);
- prt_printf(out, "Discard:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Btree allocated bitmap:\t");
+ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
prt_newline(out);
- prt_printf(out, "Freespace initialized:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_newline(out);
+ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+
+ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
printbuf_indent_sub(out, 2);
}
-static int bch2_sb_members_v1_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
@@ -332,9 +310,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
-static int bch2_sb_members_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
@@ -389,12 +366,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
printbuf_indent_sub(out, 2);
prt_str(out, "IO errors since ");
@@ -403,12 +376,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
+ atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
printbuf_indent_sub(out, 2);
}
@@ -426,3 +396,71 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair, we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bool ret = true;
+ rcu_read_lock();
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ BUG_ON(m->btree_bitmap_shift > 57);
+ BUG_ON(end > 64ULL << m->btree_bitmap_shift);
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
+ continue;
+
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+ }
+}
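The per-member btree bitmap added here is only 64 bits wide: bit i covers the sector range [i << btree_bitmap_shift, (i + 1) << btree_bitmap_shift), and when an allocation lands beyond what 64 bits can describe, __bch2_dev_btree_bitmap_mark() raises the shift and folds existing bits down, trading precision for coverage. A self-contained sketch of the same mark/test logic on a hypothetical struct (illustration only; the kernel version operates on struct bch_member and computes the new shift in one step):

/* Hypothetical, simplified model of the per-member btree bitmap.
 * Mirrors the logic of __bch2_dev_btree_bitmap_mark() and
 * bch2_dev_btree_bitmap_marked_sectors(), but on a plain struct. */
#include <stdbool.h>
#include <stdint.h>

struct btree_bitmap {
	uint8_t  shift;		/* each bit covers 1 << shift sectors */
	uint64_t bits;
};

static void btree_bitmap_mark(struct btree_bitmap *b, uint64_t start, unsigned sectors)
{
	uint64_t end = start + sectors;

	/* Coarsen the granularity until 64 bits are enough to reach 'end': */
	while (end > 64ULL << b->shift) {
		uint64_t folded = 0;

		for (unsigned i = 0; i < 64; i++)
			if (b->bits & (1ULL << i))
				folded |= 1ULL << (i >> 1);
		b->bits = folded;
		b->shift++;
	}

	for (uint64_t bit = start >> b->shift;
	     bit << b->shift < end;
	     bit++)
		b->bits |= 1ULL << bit;
}

static bool btree_bitmap_marked(const struct btree_bitmap *b, uint64_t start, unsigned sectors)
{
	uint64_t end = start + sectors;

	if (end > 64ULL << b->shift)
		return false;

	for (uint64_t bit = start >> b->shift;
	     bit << b->shift < end;
	     bit++)
		if (!(b->bits & (1ULL << bit)))
			return false;
	return true;
}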
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a94183271..dd93192ec065 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
+#include "bkey_types.h"
extern char * const bch2_member_error_strs[];
@@ -28,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca)
ca->mi.state != BCH_MEMBER_STATE_failed;
}
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return true;
-
- percpu_ref_put(&ca->io_ref);
- return false;
-}
-
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
{
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
@@ -104,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
for (struct bch_dev *_ca = NULL; \
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+static inline void bch2_dev_get(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
+#else
+ percpu_ref_get(&ca->ref);
+#endif
+}
+
+static inline void __bch2_dev_put(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ long r = atomic_long_dec_return(&ca->ref);
+ if (r < (long) !ca->dying)
+ panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
+ ca->last_put = _THIS_IP_;
+ if (!r)
+ complete(&ca->ref_completion);
+#else
+ percpu_ref_put(&ca->ref);
+#endif
+}
+
+static inline void bch2_dev_put(struct bch_dev *ca)
{
if (ca)
- percpu_ref_put(&ca->ref);
+ __bch2_dev_put(ca);
+}
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+{
rcu_read_lock();
+ bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
- percpu_ref_get(&ca->ref);
+ bch2_dev_get(ca);
rcu_read_unlock();
return ca;
@@ -131,10 +146,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
unsigned state_mask)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->io_ref);
- rcu_read_lock();
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
@@ -157,26 +172,113 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
#define for_each_readable_member(c, ca) \
__for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
- return rcu_dereference_check(c->devs[idx], 1);
+static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
+{
+ return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
}
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ EBUG_ON(!bch2_dev_exists(c, dev));
- return rcu_dereference_protected(c->devs[idx],
+ return rcu_dereference_check(c->devs[dev], 1);
+}
+
+static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
+{
+ EBUG_ON(!bch2_dev_exists(c, dev));
+
+ return rcu_dereference_protected(c->devs[dev],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ return c && dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[dev])
+ : NULL;
+}
+
+static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca)
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+ return ca;
+}
+
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ if (!ca)
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
+ if (ca && !bucket_valid(ca, bucket.offset)) {
+ bch2_dev_put(ca);
+ ca = NULL;
+ }
+ return ca;
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
+
+static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
+ if (!ca)
+ bch2_dev_bucket_missing(c, bucket);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget_noerror(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (ca &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ return ca;
+
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ return NULL;
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@@ -191,16 +293,16 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-static inline bool bch2_member_exists(struct bch_member *m)
+static inline bool bch2_member_alive(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
}
-static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
{
if (dev < sb->nr_devices) {
struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_exists(&m);
+ return bch2_member_alive(&m);
}
return false;
}
@@ -209,6 +311,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
+ .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
+ le16_to_cpu(mi->first_bucket),
.first_bucket = le16_to_cpu(mi->first_bucket),
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
@@ -219,7 +323,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_exists(mi),
+ .valid = bch2_member_alive(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
}
@@ -228,4 +334,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+ u64 end = start + sectors;
+
+ if (end > 64ULL << ca->mi.btree_bitmap_shift)
+ return false;
+
+ for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+ (u64) bit << ca->mi.btree_bitmap_shift < end;
+ bit++)
+ if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+ return false;
+ return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
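sb-members.h now centralizes device lookup and reference counting: callers go through bch2_dev_tryget()/bch2_dev_tryget_noerror(), which take the RCU read lock and bump the per-device ref (with underflow checking in debug builds), and must pair every successful lookup with bch2_dev_put(). A short sketch of the expected calling pattern, with a made-up caller name and error codes (assumes the usual bcachefs headers):

/* Illustrative caller only, not from the patch: shows the tryget/put
 * pairing the new accessors expect. */
static int example_use_device(struct bch_fs *c, unsigned dev, u64 bucket)
{
	struct bch_dev *ca = bch2_dev_tryget(c, dev);	/* reports the missing device on failure */
	if (!ca)
		return -ENODEV;

	int ret = bucket_valid(ca, bucket) ? 0 : -EINVAL;

	bch2_dev_put(ca);	/* every successful tryget needs a matching put */
	return ret;
}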
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
new file mode 100644
index 000000000000..e2630548c0f6
--- /dev/null
+++ b/fs/bcachefs/sb-members_format.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
+#define _BCACHEFS_SB_MEMBERS_FORMAT_H
+
+/*
+ * We refer to members with bitmasks in various places - but we need to get rid
+ * of this limit:
+ */
+#define BCH_SB_MEMBERS_MAX 64
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+ __le64 seq;
+ __le64 btree_allocated_bitmap;
+ /*
+ * On recovery from a clean shutdown we don't normally read the journal,
+ * but we still want to resume writing from where we left off so we
+ * don't overwrite more than is necessary, for list journal debugging:
+ */
+ __le32 last_journal_bucket;
+ __le32 last_journal_bucket_offset;
+};
+
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocations are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; //Members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; //size of single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
new file mode 100644
index 000000000000..c0eda888fe39
--- /dev/null
+++ b/fs/bcachefs/sb-members_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
+#define _BCACHEFS_SB_MEMBERS_TYPES_H
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u64 nbuckets_minus_first;
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 39debe814bf3..51918acfd726 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -8,6 +8,7 @@
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -31,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -48,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot_tree *s)
{
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+ BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
ret = -BCH_ERR_ENOENT_snapshot_tree;
@@ -93,8 +94,10 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
return id == ancestor;
}
@@ -110,6 +113,8 @@ static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancest
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
if (s->skip[2] <= ancestor)
return s->skip[2];
@@ -120,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
bool ret;
@@ -127,7 +141,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
rcu_read_lock();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
goto out;
}
@@ -135,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
rcu_read_unlock();
@@ -151,36 +163,39 @@ out:
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
- size_t new_size;
struct snapshot_table *new, *old;
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
+ new->nr = new_size;
+
old = rcu_dereference_protected(c->snapshots, true);
if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- kvfree_rcu_mightsleep(old);
+ kvfree_rcu(old, rcu);
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
lockdep_assert_held(&c->snapshot_table_lock);
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
return __snapshot_t_mut(c, id);
}
@@ -208,7 +223,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_snapshot s;
@@ -283,7 +298,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct snapshot_t *t;
@@ -337,7 +352,7 @@ err:
int bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
@@ -346,7 +361,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot, s);
+ BTREE_ITER_with_updates, snapshot, s);
}
static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
@@ -567,6 +582,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -596,7 +618,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -673,7 +695,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
root = bch2_bkey_get_iter_typed(trans, &root_iter,
BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_WITH_UPDATES, snapshot);
+ BTREE_ITER_with_updates, snapshot);
ret = bkey_err(root);
if (ret)
goto err;
@@ -724,7 +746,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -770,7 +791,7 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
@@ -865,13 +886,181 @@ int bch2_check_snapshots(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
}
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ /* 0 is an invalid tree ID */
+ u32 tree_id = 0;
+ int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ c, snapshot_node_missing,
+ "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
+ bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -921,7 +1110,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
int ret = 0;
s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_INTENT, snapshot);
+ BTREE_ITER_intent, snapshot);
ret = bkey_err(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
@@ -1030,7 +1219,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_INTENT);
+ POS_MIN, BTREE_ITER_intent);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
@@ -1181,35 +1370,39 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-static int snapshot_delete_key(struct btree_trans *trans,
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
snapshot_id_list *deleted,
snapshot_id_list *equiv_seen,
struct bpos *last_pos)
{
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
struct bch_fs *c = trans->c;
u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
+ return 0;
if (!bkey_eq(k.k->p, *last_pos))
equiv_seen->nr = 0;
- *last_pos = k.k->p;
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(equiv_seen, equiv)) {
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- } else {
- return snapshot_list_add(c, equiv_seen, equiv);
- }
-}
+ BTREE_UPDATE_internal_snapshot_node);
-static int move_key_to_correct_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ if (!bpos_eq(*last_pos, k.k->p) &&
+ snapshot_list_has_id(equiv_seen, equiv))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+
+ *last_pos = k.k->p;
+
+ ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
+ if (ret)
+ return ret;
/*
* When we have a linear chain of snapshot nodes, we consider
@@ -1219,31 +1412,30 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
*
* If there are multiple keys in different snapshots at the same
* position, we're only going to keep the one in the newest
- * snapshot - the rest have been overwritten and are redundant,
- * and for the key we're going to keep we need to move it to the
- * equivalance class ID if it's not there already.
+ * snapshot (we delete the others above) - the rest have been
+ * overwritten and are redundant, and for the key we're going to keep we
+ * need to move it to the equivalence class ID if it's not there
+ * already.
*/
if (equiv != k.k->p.snapshot) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- struct btree_iter new_iter;
- int ret;
-
- ret = PTR_ERR_OR_ZERO(new);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
new->k.p.snapshot = equiv;
+ struct btree_iter new_iter;
bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&new_iter) ?:
bch2_trans_update(trans, &new_iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &new_iter);
if (ret)
return ret;
@@ -1368,7 +1560,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
struct btree_trans *trans;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
- u32 id;
int ret = 0;
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
@@ -1415,33 +1606,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (ret)
goto err;
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
struct bpos last_pos = POS_MIN;
snapshot_id_list equiv_seen = { 0 };
struct disk_reservation res = { 0 };
- if (!btree_type_has_snapshots(id))
- continue;
-
- /*
- * deleted inodes btree is maintained by a trigger on the inodes
- * btree - no work for us to do here, and it's not safe to scan
- * it because we'll see out of date keys due to the btree write
- * buffer:
- */
- if (id == BTREE_ID_deleted_inodes)
+ if (!btree_type_has_snapshots(btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
- for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ btree, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
- move_key_to_correct_snapshot(trans, &iter, k));
+ delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
+ &equiv_seen, &last_pos));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
@@ -1474,7 +1652,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
* nodes some depth fields will be off:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
@@ -1530,8 +1708,8 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, id, pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
while (1) {
k = bch2_btree_iter_prev(&iter);
ret = bkey_err(k);
@@ -1583,7 +1761,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
pos.snapshot = leaf_id;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -1682,6 +1860,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc06385..31b0ee03e962 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -2,11 +2,11 @@
#ifndef _BCACHEFS_SNAPSHOT_H
#define _BCACHEFS_SNAPSHOT_H
-enum bkey_invalid_flags;
+enum bch_validate_flags;
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_tree_invalid, \
@@ -20,9 +20,10 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
@@ -33,7 +34,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{
- return &t->s[U32_MAX - id];
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
}
static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +49,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = snapshot_t(c, id)->tree;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
rcu_read_unlock();
return id;
@@ -52,7 +58,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
}
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +73,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ u32 parent = s->parent;
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
id, snapshot_t(c, id)->depth,
parent, snapshot_t(c, parent)->depth);
return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
}
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -116,7 +123,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->equiv;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->equiv : 0;
}
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
@@ -128,43 +136,22 @@ static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
return id;
}
-static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- return id == bch2_snapshot_equiv(c, id);
-}
-
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
- bool ret;
-
rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
+ const struct snapshot_t *s = snapshot_t(c, id);
+ int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
rcu_read_unlock();
return ret;
}
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
}
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -189,12 +176,9 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *t;
- bool ret;
-
rcu_read_lock();
- t = snapshot_t(c, id);
- ret = (t->children[0]|t->children[1]) != 0;
+ const struct snapshot_t *t = snapshot_t(c, id);
+ bool ret = t && (t->children[0]|t->children[1]) != 0;
rcu_read_unlock();
return ret;
@@ -218,15 +202,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +241,8 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
+int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,7 +254,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bpos pos)
{
if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
return 0;
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
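A minimal sketch (not part of the patch) of the lookup pattern the bounds-checked __snapshot_t() above enables: callers take the RCU read lock and treat a NULL return, meaning an id outside the table, as "no such node", just as the accessors above do. The helper name is hypothetical and the depth field is used purely for illustration:

static inline u32 example_snapshot_depth_or_zero(struct bch_fs *c, u32 id)
{
	rcu_read_lock();
	const struct snapshot_t *s = snapshot_t(c, id);
	/* NULL means id does not name a valid snapshot node */
	u32 depth = s ? s->depth : 0;
	rcu_read_unlock();

	return depth;
}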
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 3976f80721bf..cbad9b27874f 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,16 +15,6 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
-typedef unsigned __bitwise bch_str_hash_flags_t;
-
-enum bch_str_hash_flags {
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
-};
-
-#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
-
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -159,13 +149,14 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
desc.is_visible(inum, k));
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags, u32 snapshot)
+ enum btree_iter_update_trigger_flags flags,
+ u32 snapshot)
{
struct bkey_s_c k;
int ret;
@@ -173,10 +164,10 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|flags, k, ret) {
+ BTREE_ITER_slots|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return 0;
+ return k;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
@@ -186,20 +177,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
}
bch2_trans_iter_exit(trans, iter);
- return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
+ return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
}
static __always_inline int
@@ -220,7 +214,7 @@ bch2_hash_hole(struct btree_trans *trans,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
bch2_trans_iter_exit(trans, iter);
@@ -242,7 +236,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -264,8 +258,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags,
- int update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
@@ -277,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -286,8 +279,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
continue;
}
- if (!slot.path &&
- !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
+ if (!slot.path && !(flags & STR_HASH_must_replace))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -305,16 +297,16 @@ found:
found = true;
not_found:
- if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (flags & STR_HASH_must_create)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, update_flags);
+ ret = bch2_trans_update(trans, &iter, insert, flags);
}
goto out;
@@ -326,14 +318,14 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
insert->k.p.inode = inum.inum;
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
bch2_hash_set_in_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ snapshot, insert, flags);
}
static __always_inline
@@ -341,7 +333,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter,
- unsigned update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i *delete;
int ret;
@@ -359,7 +351,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, update_flags);
+ return bch2_trans_update(trans, iter, delete, flags);
}
static __always_inline
@@ -369,14 +361,10 @@ int bch2_hash_delete(struct btree_trans *trans,
subvol_inum inum, const void *key)
{
struct btree_iter iter;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_INTENT);
- if (ret)
- return ret;
-
- ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_intent);
+ int ret = bkey_err(k) ?:
+ bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
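A caller-side sketch (not part of the patch) of the new bch2_hash_lookup() convention above: it now returns a bkey_s_c instead of an int, so callers test bkey_err() and still exit the iterator themselves, as bch2_hash_delete() does. The function name is hypothetical and desc/info/inum/key stand in for whatever a real caller passes:

static int example_hash_lookup(struct btree_trans *trans,
			       const struct bch_hash_desc desc,
			       const struct bch_hash_info *info,
			       subvol_inum inum, const void *key)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info,
					     inum, key, 0);
	int ret = bkey_err(k);
	if (!ret) {
		/* k is valid here; iter points at the matching slot */
	}
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}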
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index ce7aed121942..dfc9cf305756 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -162,7 +162,7 @@ int bch2_check_subvols(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -198,7 +198,7 @@ int bch2_check_subvol_children(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol_child(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -208,14 +208,23 @@ int bch2_check_subvol_children(struct bch_fs *c)
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
+ struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
int ret = 0;
bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
subvol_pos_bad,
"invalid pos");
+
+ bkey_fsck_err_on(!subvol.v->snapshot, c, err,
+ subvol_snapshot_bad,
+ "invalid snapshot");
+
+ bkey_fsck_err_on(!subvol.v->inode, c, err,
+ subvol_inode_bad,
+ "invalid inode");
fsck_err:
return ret;
}
@@ -245,9 +254,9 @@ static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bo
int bch2_subvolume_trigger(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bpos children_pos_old = subvolume_children_pos(old);
struct bpos children_pos_new = subvolume_children_pos(new.s_c);
@@ -333,7 +342,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
subvol = bch2_bkey_get_iter_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ BTREE_ITER_cached|BTREE_ITER_with_updates,
subvolume);
ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -383,9 +392,9 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
return lockrestart_do(trans,
bch2_subvolume_get(trans, subvolid_to_delete, true,
- BTREE_ITER_CACHED, &s)) ?:
+ BTREE_ITER_cached, &s)) ?:
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.creation_parent)));
@@ -404,7 +413,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
subvol = bch2_bkey_get_iter_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+ BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -505,7 +514,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
n = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(n);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -547,7 +556,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(src_subvol);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -595,6 +604,78 @@ err:
return ret;
}
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 903c05162c06..afa5e871efb2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -5,16 +5,17 @@
#include "darray.h"
#include "subvolume_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_check_subvols(struct bch_fs *);
int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
@@ -37,6 +38,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
+
int bch2_fs_subvolumes_init(struct bch_fs *);
#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc391..9b10c8947828 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,6 +20,8 @@ struct snapshot_t {
};
struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index ad28e370b640..d73a0222f709 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
#include "journal.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
@@ -76,7 +76,7 @@ const char * const bch2_sb_fields[] = {
};
static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
@@ -143,7 +143,7 @@ void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
if (!IS_ERR_OR_NULL(sb->s_bdev_file))
- fput(sb->s_bdev_file);
+ bdev_fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -232,7 +232,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return NULL;
}
}
@@ -344,8 +344,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
- int rw)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
+ enum bch_validate_flags flags, struct printbuf *out)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
@@ -401,7 +401,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_time_precision;
}
- if (rw == READ) {
+ if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
* zeroed, so we're now validating them, but we have to be
@@ -457,7 +457,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_members_missing;
}
- ret = bch2_sb_field_validate(sb, &mi->field, out);
+ ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
if (ret)
return ret;
@@ -465,12 +465,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
continue;
- ret = bch2_sb_field_validate(sb, f, out);
+ ret = bch2_sb_field_validate(sb, f, flags, out);
if (ret)
return ret;
}
- if (rw == WRITE &&
+ if ((flags & BCH_VALIDATE_write) &&
bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
@@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -647,7 +649,7 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) {
prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
bytes, 512UL << sb->sb->layout.sb_max_size_bits);
return -BCH_ERR_invalid_sb_too_big;
@@ -698,8 +700,11 @@ retry:
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name)
- return -ENOMEM;
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
@@ -814,7 +819,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb, &err, READ);
+ ret = bch2_sb_validate(sb, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -918,6 +923,7 @@ int bch2_write_super(struct bch_fs *c)
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ DARRAY(struct bch_dev *) online_devices = {};
int ret = 0;
trace_and_count(c, write_super, c, _RET_IP_);
@@ -930,6 +936,15 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ for_each_online_member(c, ca) {
+ ret = darray_push(&online_devices, ca);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ percpu_ref_get(&ca->io_ref);
+ }
+
/* Make sure we're using the new magic numbers: */
c->disk_sb.sb->magic = BCHFS_MAGIC;
c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
@@ -937,8 +952,8 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- for_each_online_member(c, ca)
- __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ darray_for_each(online_devices, ca)
+ __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
if (test_bit(BCH_FS_error, &c->flags))
@@ -954,16 +969,15 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(c, ca)
- bch2_sb_from_fs(c, ca);
+ darray_for_each(online_devices, ca)
+ bch2_sb_from_fs(c, (*ca));
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- percpu_ref_put(&ca->io_ref);
goto out;
}
}
@@ -990,53 +1004,67 @@ int bch2_write_super(struct bch_fs *c)
return -BCH_ERR_sb_not_downgraded;
}
- for_each_online_member(c, ca) {
- __set_bit(ca->dev_idx, sb_written.d);
- ca->sb_write_error = 0;
+ darray_for_each(online_devices, ca) {
+ __set_bit((*ca)->dev_idx, sb_written.d);
+ (*ca)->sb_write_error = 0;
}
- for_each_online_member(c, ca)
- read_back_super(c, ca);
+ darray_for_each(online_devices, ca)
+ read_back_super(c, *ca);
closure_sync(cl);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+
if (ca->sb_write_error)
continue;
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = -BCH_ERR_erofs_sb_err;
- goto out;
}
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = -BCH_ERR_erofs_sb_err;
- goto out;
}
}
+ if (ret)
+ goto out;
+
do {
wrote = false;
- for_each_online_member(c, ca)
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
+ }
closure_sync(cl);
sb++;
} while (wrote);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1072,6 +1100,9 @@ int bch2_write_super(struct bch_fs *c)
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ darray_for_each(online_devices, ca)
+ percpu_ref_put(&(*ca)->io_ref);
+ darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
}
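A condensed sketch (not part of the patch) of the pattern bch2_write_super() adopts above: snapshot the online members into a darray, taking an extra io_ref on each so the device set cannot change mid-write, then drop the refs on the way out. The helper name is hypothetical and error handling beyond the allocation failure is elided:

static int example_with_stable_online_devs(struct bch_fs *c)
{
	DARRAY(struct bch_dev *) devs = {};
	int ret = 0;

	for_each_online_member(c, ca) {
		ret = darray_push(&devs, ca);
		if (ret) {
			/* drop the ref the iterator holds before breaking out */
			percpu_ref_put(&ca->io_ref);
			goto out;
		}
		percpu_ref_get(&ca->io_ref);
	}

	darray_for_each(devs, i) {
		struct bch_dev *ca = *i;
		/* per-device work against a stable set, e.g.: */
		bch2_sb_from_fs(c, ca);
	}
out:
	darray_for_each(devs, i)
		percpu_ref_put(&(*i)->io_ref);
	darray_exit(&devs);
	return ret;
}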
@@ -1101,18 +1132,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
* c->sb will be checked before we write the superblock, so update it as
* well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
- }
- if (c->sb.version > bcachefs_metadata_version_current) {
+ if (c->sb.version > bcachefs_metadata_version_current)
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version = bcachefs_metadata_version_current;
- }
- if (c->sb.version_min > bcachefs_metadata_version_current) {
+ if (c->sb.version_min > bcachefs_metadata_version_current)
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version_min = bcachefs_metadata_version_current;
- }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@@ -1130,7 +1155,7 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
if (vstruct_bytes(f) < 88) {
prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
@@ -1145,8 +1170,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_ext *e = field_to_type(f, ext);
- prt_printf(out, "Recovery passes required:");
- prt_tab(out);
+ prt_printf(out, "Recovery passes required:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
prt_newline(out);
@@ -1155,13 +1179,16 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
if (errors_silent) {
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
- prt_printf(out, "Errors to silently fix:");
- prt_tab(out);
+ prt_printf(out, "Errors to silently fix:\t");
prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
prt_newline(out);
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:\t");
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
@@ -1186,14 +1213,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
}
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
struct printbuf field_err = PRINTBUF;
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
int ret;
- ret = ops->validate ? ops->validate(sb, f, &field_err) : 0;
+ ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
if (ret) {
prt_printf(err, "Invalid superblock section %s: %s",
bch2_sb_fields[type], field_err.buf);
@@ -1267,97 +1294,73 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
printbuf_tabstop_push(out, 44);
for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_dev_exists(sb, i);
+ nr_devices += bch2_member_exists(sb, i);
- prt_printf(out, "External UUID:");
- prt_tab(out);
+ prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
- prt_printf(out, "Internal UUID:");
- prt_tab(out);
+ prt_printf(out, "Internal UUID:\t");
pr_uuid(out, sb->uuid.b);
prt_newline(out);
- prt_printf(out, "Magic number:");
- prt_tab(out);
+ prt_printf(out, "Magic number:\t");
pr_uuid(out, sb->magic.b);
prt_newline(out);
- prt_str(out, "Device index:");
- prt_tab(out);
- prt_printf(out, "%u", sb->dev_idx);
- prt_newline(out);
+ prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:");
- prt_tab(out);
+ prt_str(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:");
- prt_tab(out);
+ prt_str(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:");
- prt_tab(out);
+ prt_str(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
- prt_printf(out, "Oldest version on disk:");
- prt_tab(out);
+ prt_printf(out, "Oldest version on disk:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version_min));
prt_newline(out);
- prt_printf(out, "Created:");
- prt_tab(out);
+ prt_printf(out, "Created:\t");
if (sb->time_base_lo)
bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
- prt_printf(out, "Sequence number:");
- prt_tab(out);
+ prt_printf(out, "Sequence number:\t");
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
- prt_printf(out, "Time of last write:");
- prt_tab(out);
+ prt_printf(out, "Time of last write:\t");
bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
prt_newline(out);
- prt_printf(out, "Superblock size:");
- prt_tab(out);
+ prt_printf(out, "Superblock size:\t");
prt_units_u64(out, vstruct_bytes(sb));
prt_str(out, "/");
prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
- prt_printf(out, "Clean:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
- prt_newline(out);
-
- prt_printf(out, "Devices:");
- prt_tab(out);
- prt_printf(out, "%u", nr_devices);
- prt_newline(out);
+ prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
+ prt_printf(out, "Devices:\t%u\n", nr_devices);
- prt_printf(out, "Sections:");
+ prt_printf(out, "Sections:\t");
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
- prt_tab(out);
prt_bitflags(out, bch2_sb_fields, fields_have);
prt_newline(out);
- prt_printf(out, "Features:");
- prt_tab(out);
+ prt_printf(out, "Features:\t");
prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
prt_newline(out);
- prt_printf(out, "Compat features:");
- prt_tab(out);
+ prt_printf(out, "Compat features:\t");
prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
prt_newline(out);
@@ -1374,8 +1377,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
if (opt->get_sb != BCH2_NO_SB_OPT) {
u64 v = bch2_opt_from_sb(sb, id);
- prt_printf(out, "%s:", opt->attr.name);
- prt_tab(out);
+ prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
prt_newline(out);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 95e80e06316b..fadd364e2802 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -51,7 +51,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ int (*validate)(struct bch_sb *, struct bch_sb_field *,
+ enum bch_validate_flags, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
};
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 1ad6e5cd9476..df2bea38e83f 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
@@ -263,7 +264,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
- bch2_gc_thread_stop(c);
bch2_fs_ec_flush(c);
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
@@ -284,11 +284,16 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
journal_cur_seq(&c->journal));
- if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sshutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
@@ -365,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -461,7 +466,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
- set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+ set_bit(JOURNAL_need_flush_write, &c->journal.flags);
+ set_bit(JOURNAL_running, &c->journal.flags);
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
@@ -479,12 +485,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
}
#endif
- ret = bch2_gc_thread_start(c);
- if (ret) {
- bch_err(c, "error starting gc thread");
- return ret;
- }
-
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -510,7 +510,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -530,12 +531,12 @@ int bch2_fs_read_write_early(struct bch_fs *c)
static void __bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -550,18 +551,20 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_io_read_exit(c);
bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
- bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
+ bch2_fs_btree_iter_exit(c);
bch2_fs_replicas_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
+ EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved));
free_percpu(c->online_reserved);
darray_exit(&c->btree_roots_extra);
@@ -606,8 +609,6 @@ void __bch2_fs_stop(struct bch_fs *c)
set_bit(BCH_FS_stopping, &c->flags);
- cancel_work_sync(&c->journal_seq_blacklist_gc_work);
-
down_write(&c->state_lock);
bch2_fs_read_only(c);
up_write(&c->state_lock);
@@ -655,6 +656,7 @@ void bch2_fs_free(struct bch_fs *c)
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
if (ca) {
+ EBUG_ON(atomic_long_read(&ca->ref) != 1);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -709,7 +711,7 @@ static int bch2_fs_online(struct bch_fs *c)
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
goto err;
}
}
@@ -768,6 +770,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
+ bch2_fs_gc_init(c);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
@@ -790,16 +793,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->journal_seq_blacklist_gc_work,
- bch2_blacklist_entries_gc);
-
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_error_msgs);
mutex_init(&c->fsck_error_msgs_lock);
- seqcount_init(&c->gc_pos_lock);
-
seqcount_init(&c->usage_lock);
sema_init(&c->io_in_flight, 128);
@@ -930,7 +928,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb.sb, i) &&
+ if (bch2_member_exists(c->disk_sb.sb, i) &&
bch2_dev_alloc(c, i)) {
ret = -EEXIST;
goto err;
@@ -1015,8 +1013,16 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -1083,7 +1089,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
+ if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
if (fs->sb->block_size != sb->sb->block_size)
@@ -1182,11 +1188,11 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
+ kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
- bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
@@ -1194,7 +1200,9 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
+#endif
kobject_put(&ca->kobj);
}
@@ -1221,12 +1229,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
+#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->ref_completion);
}
+#endif
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
@@ -1295,14 +1305,17 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / btree_sectors(c));
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
- 0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+#ifndef CONFIG_BCACHEFS_DEBUG
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
+ goto err;
+#else
+ atomic_long_set(&ca->ref, 1);
+#endif
+
+ if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
- bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1393,10 +1406,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
+ BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
- ca = bch_dev_locked(c, sb->sb->dev_idx);
+ ca = bch2_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
@@ -1488,10 +1500,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb.sb, i))
+ if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
- ca = bch_dev_locked(c, i);
+ ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
@@ -1581,17 +1593,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
* with bch2_do_invalidates() and bch2_do_discards()
*/
ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_NORUN, NULL);
+ BTREE_TRIGGER_norun, NULL);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1608,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* We consume a reference to ca->ref, regardless of whether we succeed
* or fail:
*/
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
@@ -1660,7 +1672,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_kill(&ca->ref);
+#else
+ ca->dying = true;
+ bch2_dev_put(ca);
+#endif
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
@@ -1759,9 +1776,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
- goto have_slot;
+ if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
+ dev_idx = c->sb.nr_devices;
+ goto have_slot;
+ }
+
+ int best = -1;
+ u64 best_last_mount = 0;
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
no_slot:
ret = -BCH_ERR_ENOSPC_sb_members;
bch_err_msg(c, ret, "setting up new superblock");
@@ -1803,7 +1839,7 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(ca, ret, "marking new superblock");
if (ret)
goto err_late;
@@ -1866,9 +1902,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ret)
goto err;
- ca = bch_dev_locked(c, dev_idx);
+ ca = bch2_dev_locked(c, dev_idx);
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
if (ret)
goto err;
@@ -1941,6 +1977,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
goto err;
}
+ if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
+ bch_err(ca, "New device size too big (%llu greater than max %u)",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ ret = -BCH_ERR_device_size_too_big;
+ goto err;
+ }
+
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
@@ -1954,7 +1997,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (ret)
goto err;
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
if (ret)
goto err;
@@ -1986,13 +2029,9 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- if (!strcmp(name, ca->name)) {
- rcu_read_unlock();
+ for_each_member_device(c, ca)
+ if (!strcmp(name, ca->name))
return ca;
- }
- rcu_read_unlock();
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
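A small sketch (not part of the patch) of the convention the comment above bch2_dev_lookup() states: the function returns with a reference on ca->ref, which callers now drop with bch2_dev_put() rather than percpu_ref_put(), matching the CONFIG_BCACHEFS_DEBUG ref changes above. The caller shown here is hypothetical:

static int example_dev_lookup_user(struct bch_fs *c, const char *name)
{
	struct bch_dev *ca = bch2_dev_lookup(c, name);
	int ret = PTR_ERR_OR_ZERO(ca);
	if (ret)
		return ret;

	/* ... use ca while holding the ref ... */

	bch2_dev_put(ca);
	return 0;
}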
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index ec784d975f66..368a63d938cf 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -26,17 +26,4 @@ struct bch_devs_list {
u8 data[BCH_BKEY_PTRS_MAX];
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 valid;
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index c86a93a8d8fc..93ca74d108b1 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -26,6 +25,7 @@
#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
@@ -139,9 +139,9 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
-write_attribute(prune_cache);
-write_attribute(btree_wakeup);
-rw_attribute(btree_gc_periodic);
+write_attribute(trigger_journal_flush);
+write_attribute(trigger_btree_cache_shrink);
+write_attribute(trigger_btree_key_cache_shrink);
rw_attribute(gc_gens_pos);
read_attribute(uuid);
@@ -166,7 +166,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
@@ -189,12 +188,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
{
bch2_printbuf_tabstop_push(out, 24);
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
- prt_str(out, bch2_write_refs[i]);
- prt_tab(out);
- prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
+ prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
}
#endif
@@ -278,7 +273,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
continue;
ret = for_each_btree_key(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_all_snapshots, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
@@ -313,22 +308,11 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- prt_str(out, "type");
printbuf_tabstop_push(out, 12);
- prt_tab(out);
-
- prt_str(out, "compressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "uncompressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "average extent size");
printbuf_tabstop_push(out, 24);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
bch2_prt_compression_type(out, i);
@@ -362,21 +346,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "\n");
}
-static void bch2_btree_wakeup_all(struct bch_fs *c)
-{
- struct btree_trans *trans;
-
- seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
-
- if (b)
- six_lock_wakeup_all(&b->lock);
-
- }
- seqmutex_unlock(&c->btree_trans_lock);
-}
-
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -392,8 +361,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_write_stats)
bch2_btree_write_stats_to_text(out, c);
- sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
-
if (attr == &sysfs_gc_gens_pos)
bch2_gc_gens_pos_to_text(out, c);
@@ -415,11 +382,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, c);
+ bch2_btree_cache_to_text(out, &c->btree_cache);
if (attr == &sysfs_btree_key_cache)
bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
@@ -462,6 +426,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_disk_groups)
bch2_disk_groups_to_text(out, c);
+ if (attr == &sysfs_alloc_debug)
+ bch2_fs_alloc_debug_to_text(out, c);
+
return 0;
}
@@ -469,14 +436,6 @@ STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- if (attr == &sysfs_btree_gc_periodic) {
- ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
- ?: (ssize_t) size;
-
- wake_up_process(c->gc_thread);
- return ret;
- }
-
if (attr == &sysfs_copy_gc_enabled) {
ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
?: (ssize_t) size;
@@ -505,10 +464,10 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_rw, &c->flags))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
- if (attr == &sysfs_prune_cache) {
+ if (attr == &sysfs_trigger_btree_cache_shrink) {
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
@@ -516,28 +475,28 @@ STORE(bch2_fs)
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
- if (attr == &sysfs_btree_wakeup)
- bch2_btree_wakeup_all(c);
-
- if (attr == &sysfs_trigger_gc) {
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- down_read(&c->state_lock);
- bch2_gc(c, false, false);
- up_read(&c->state_lock);
-#else
- bch2_gc_gens(c);
-#endif
+ if (attr == &sysfs_trigger_btree_key_cache_shrink) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
+ if (attr == &sysfs_trigger_gc)
+ bch2_gc_gens(c);
+
if (attr == &sysfs_trigger_discards)
bch2_do_discards(c);
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +517,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -591,13 +551,11 @@ SHOW(bch2_fs_counters)
if (attr == &sysfs_##t) { \
counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- prt_printf(out, "since mount:"); \
- prt_tab(out); \
+ prt_printf(out, "since mount:\t"); \
prt_human_readable_u64(out, counter_since_mount); \
prt_newline(out); \
\
- prt_printf(out, "since filesystem creation:"); \
- prt_tab(out); \
+ prt_printf(out, "since filesystem creation:\t"); \
prt_human_readable_u64(out, counter); \
prt_newline(out); \
}
@@ -639,7 +597,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
@@ -657,8 +614,9 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
- &sysfs_prune_cache,
- &sysfs_btree_wakeup,
+ &sysfs_trigger_journal_flush,
+ &sysfs_trigger_btree_cache_shrink,
+ &sysfs_trigger_btree_key_cache_shrink,
&sysfs_gc_gens_pos,
@@ -674,6 +632,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_internal_uuid,
&sysfs_disk_groups,
+ &sysfs_alloc_debug,
NULL
};
@@ -789,88 +748,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
- unsigned i, nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstop_push(out, 8);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
- bch2_dev_usage_to_text(out, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:");
- prt_newline(out);
- for (i = 0; i < BCH_WATERMARK_NR; i++) {
- prt_str(out, bch2_watermarks[i]);
- prt_tab(out);
- prt_u64(out, bch2_dev_buckets_reserved(ca, i));
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_str(out, "freelist_wait");
- prt_tab(out);
- prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open buckets allocated");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_newline(out);
-
- prt_str(out, "open buckets this dev");
- prt_tab(out);
- prt_u64(out, ca->nr_open_buckets);
- prt_newline(out);
-
- prt_str(out, "open buckets total");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT);
- prt_newline(out);
-
- prt_str(out, "open_buckets_wait");
- prt_tab(out);
- prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open_buckets_btree");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_btree]);
- prt_newline(out);
-
- prt_str(out, "open_buckets_user");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_user]);
- prt_newline(out);
-
- prt_str(out, "buckets_to_invalidate");
- prt_tab(out);
- prt_u64(out, should_invalidate_buckets(ca, stats));
- prt_newline(out);
-
- prt_str(out, "btree reserve cache");
- prt_tab(out);
- prt_u64(out, c->btree_reserve_cache_nr);
- prt_newline(out);
-}
-
static const char * const bch2_rw[] = {
"read",
"write",
@@ -940,7 +817,7 @@ SHOW(bch2_dev)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_alloc_debug)
- dev_alloc_debug_to_text(out, ca);
+ bch2_dev_alloc_debug_to_text(out, ca);
return 0;
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc57747..68104b2056d9 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -261,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_trans_run(c,
for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i >= nr * 2)
break;
@@ -322,7 +322,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_trans_run(c,
for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i == nr)
break;
BUG_ON(bkey_deleted(k.k) != !(i % 16));
@@ -452,7 +452,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
return ret;
}
@@ -671,8 +671,8 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
@@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ BTREE_ITER_slots|BTREE_ITER_intent, k,
NULL, NULL, 0, ({
if (iter.pos.offset >= nr)
break;
@@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, 0, ({
struct bkey_i_cookie u;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 940db15d6a93..b1af7ac430f6 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -294,16 +294,27 @@ static int thread_with_stdio_fn(void *arg)
return 0;
}
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
+void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
+}
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
+{
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
}
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ bch2_thread_with_stdio_init(thr, ops);
+
+ return __bch2_run_thread_with_stdio(thr);
+}
+
int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index af54ea8f5b0f..1d63d14d7dca 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -63,6 +63,9 @@ struct thread_with_stdio {
const struct thread_with_stdio_ops *ops;
};
+void bch2_thread_with_stdio_init(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 6aa81d1e6d36..84fcf26e306e 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str,
TP_fast_assign(
__entry->dev = c->dev;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str,
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %pS %s",
@@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
TP_fast_assign(
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %s",
@@ -638,99 +638,14 @@ DEFINE_EVENT(bch_fs, gc_gens_end,
/* Allocator */
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err),
-
- TP_STRUCT__entry(
- __field(u8, dev )
- __array(char, reserve, 16 )
- __field(u64, bucket )
- __field(u64, free )
- __field(u64, avail )
- __field(u64, copygc_wait_amount )
- __field(s64, copygc_waiting_for )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, nouse )
- __field(bool, nonblocking )
- __field(u64, nocow )
- __array(char, err, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = ca->dev_idx;
- strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
- __entry->bucket = bucket;
- __entry->free = free;
- __entry->avail = avail;
- __entry->copygc_wait_amount = copygc_wait_amount;
- __entry->copygc_waiting_for = copygc_waiting_for;
- __entry->seen = s->buckets_seen;
- __entry->open = s->skipped_open;
- __entry->need_journal_commit = s->skipped_need_journal_commit;
- __entry->nouse = s->skipped_nouse;
- __entry->nonblocking = nonblocking;
- __entry->nocow = s->skipped_nocow;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
-
- TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
- __entry->reserve,
- __entry->dev,
- __entry->bucket,
- __entry->free,
- __entry->avail,
- __entry->copygc_wait_amount,
- __entry->copygc_waiting_for,
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->nouse,
- __entry->nocow,
- __entry->nonblocking,
- __entry->err)
+DEFINE_EVENT(fs_str, bucket_alloc,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
+DEFINE_EVENT(fs_str, bucket_alloc_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
TRACE_EVENT(discard_buckets,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 216fadf16928..de331dec2a99 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -348,15 +348,12 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
- prt_tab_rjust(out);
- prt_printf(out, "%s", u->name);
+ prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
}
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
- prt_str(out, name);
- prt_tab(out);
+ prt_printf(out, "%s\t", name);
bch2_pr_time_units_aligned(out, ns);
prt_newline(out);
}
@@ -389,12 +386,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
}
printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:");
- prt_tab(out);
- prt_printf(out, "%llu ",
- stats->duration_stats.n);
+ prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
printbuf_tabstop_pop(out);
- prt_newline(out);
printbuf_tabstops_reset(out);
@@ -403,13 +396,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 0);
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- prt_tab(out);
- prt_printf(out, "since mount");
- prt_tab_rjust(out);
- prt_tab(out);
+ prt_printf(out, "\tsince mount\r\trecent\r\n");
prt_printf(out, "recent");
- prt_tab_rjust(out);
- prt_newline(out);
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);
@@ -417,23 +405,20 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 2);
printbuf_tabstop_push(out, TABSTOP_SIZE);
- prt_printf(out, "duration of events");
- prt_newline(out);
+ prt_printf(out, "duration of events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
pr_name_and_units(out, "total:", stats->total_duration);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -441,22 +426,19 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_indent_sub(out, 2);
prt_newline(out);
- prt_printf(out, "time between events");
- prt_newline(out);
+ prt_printf(out, "time between events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_freq);
pr_name_and_units(out, "max:", stats->max_freq);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -589,40 +571,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
- prt_printf(out, "rate:");
- prt_tab(out);
+ prt_printf(out, "rate:\t");
prt_human_readable_s64(out, pd->rate.rate);
prt_newline(out);
- prt_printf(out, "target:");
- prt_tab(out);
+ prt_printf(out, "target:\t");
prt_human_readable_u64(out, pd->last_target);
prt_newline(out);
- prt_printf(out, "actual:");
- prt_tab(out);
+ prt_printf(out, "actual:\t");
prt_human_readable_u64(out, pd->last_actual);
prt_newline(out);
- prt_printf(out, "proportional:");
- prt_tab(out);
+ prt_printf(out, "proportional:\t");
prt_human_readable_s64(out, pd->last_proportional);
prt_newline(out);
- prt_printf(out, "derivative:");
- prt_tab(out);
+ prt_printf(out, "derivative:\t");
prt_human_readable_s64(out, pd->last_derivative);
prt_newline(out);
- prt_printf(out, "change:");
- prt_tab(out);
+ prt_printf(out, "change:\t");
prt_human_readable_s64(out, pd->last_change);
prt_newline(out);
- prt_printf(out, "next io:");
- prt_tab(out);
- prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
- prt_newline(out);
+ prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
}
/* misc: */
@@ -707,149 +680,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 175aee3074c7..5d2c470a49ac 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -445,11 +445,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
-
#define closure_bio_submit(bio, cl) \
do { \
closure_get(cl); \
@@ -631,10 +626,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -792,9 +783,27 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi
#endif
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
}
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 754f17bba68e..c11bf6dacc2c 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
};
int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
@@ -118,11 +118,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
else
prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+ unsigned name_len = xattr.v->x_name_len;
+ unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+ unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+ offsetof(struct bch_xattr, x_name);
+
+ val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+ name_len = min(name_len, max_name_val_bytes);
+
prt_printf(out, "%.*s:%.*s",
- xattr.v->x_name_len,
- xattr.v->x_name,
- le16_to_cpu(xattr.v->x_val_len),
- (char *) xattr_val(xattr.v));
+ name_len, xattr.v->x_name,
+ val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
@@ -138,21 +144,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
if (ret)
- goto err1;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err2;
+ return ret;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -160,10 +158,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
-err2:
bch2_trans_iter_exit(trans, &iter);
-err1:
- return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+ return ret;
}
int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
@@ -177,7 +173,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
return ret;
@@ -212,8 +208,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inum, &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
+ (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
} else {
struct xattr_search_key search =
X_SEARCH(type, name, strlen(name));
@@ -359,6 +355,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+ if (ret < 0 && bch2_err_matches(ret, ENOENT))
+ ret = -ENODATA;
+
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 1337f31a5c49..1574b9eb4c85 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -7,7 +7,7 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr ((struct bkey_ops) { \