summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/Kconfig16
-rw-r--r--fs/bcachefs/Makefile12
-rw-r--r--fs/bcachefs/acl.c95
-rw-r--r--fs/bcachefs/acl.h2
-rw-r--r--fs/bcachefs/alloc_background.c1562
-rw-r--r--fs/bcachefs/alloc_background.h203
-rw-r--r--fs/bcachefs/alloc_background_format.h7
-rw-r--r--fs/bcachefs/alloc_foreground.c697
-rw-r--r--fs/bcachefs/alloc_foreground.h30
-rw-r--r--fs/bcachefs/alloc_types.h11
-rw-r--r--fs/bcachefs/backpointers.c1227
-rw-r--r--fs/bcachefs/backpointers.h147
-rw-r--r--fs/bcachefs/bbpos.h2
-rw-r--r--fs/bcachefs/bbpos_types.h2
-rw-r--r--fs/bcachefs/bcachefs.h221
-rw-r--r--fs/bcachefs/bcachefs_format.h448
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h36
-rw-r--r--fs/bcachefs/bkey.c25
-rw-r--r--fs/bcachefs/bkey.h257
-rw-r--r--fs/bcachefs/bkey_methods.c135
-rw-r--r--fs/bcachefs/bkey_methods.h92
-rw-r--r--fs/bcachefs/bkey_sort.c79
-rw-r--r--fs/bcachefs/bkey_sort.h4
-rw-r--r--fs/bcachefs/bkey_types.h241
-rw-r--r--fs/bcachefs/bset.c227
-rw-r--r--fs/bcachefs/bset.h12
-rw-r--r--fs/bcachefs/btree_cache.c621
-rw-r--r--fs/bcachefs/btree_cache.h28
-rw-r--r--fs/bcachefs/btree_gc.c1790
-rw-r--r--fs/bcachefs/btree_gc.h79
-rw-r--r--fs/bcachefs/btree_gc_types.h34
-rw-r--r--fs/bcachefs/btree_io.c599
-rw-r--r--fs/bcachefs/btree_io.h18
-rw-r--r--fs/bcachefs/btree_iter.c1157
-rw-r--r--fs/bcachefs/btree_iter.h320
-rw-r--r--fs/bcachefs/btree_journal_iter.c526
-rw-r--r--fs/bcachefs/btree_journal_iter.h57
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h36
-rw-r--r--fs/bcachefs/btree_key_cache.c876
-rw-r--r--fs/bcachefs/btree_key_cache.h18
-rw-r--r--fs/bcachefs/btree_key_cache_types.h24
-rw-r--r--fs/bcachefs/btree_locking.c290
-rw-r--r--fs/bcachefs/btree_locking.h92
-rw-r--r--fs/bcachefs/btree_node_scan.c598
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h31
-rw-r--r--fs/bcachefs/btree_trans_commit.c460
-rw-r--r--fs/bcachefs/btree_types.h259
-rw-r--r--fs/bcachefs/btree_update.c200
-rw-r--r--fs/bcachefs/btree_update.h95
-rw-r--r--fs/bcachefs/btree_update_interior.c942
-rw-r--r--fs/bcachefs/btree_update_interior.h48
-rw-r--r--fs/bcachefs/btree_write_buffer.c337
-rw-r--r--fs/bcachefs/btree_write_buffer.h53
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h2
-rw-r--r--fs/bcachefs/buckets.c1565
-rw-r--r--fs/bcachefs/buckets.h217
-rw-r--r--fs/bcachefs/buckets_types.h27
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c27
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h4
-rw-r--r--fs/bcachefs/chardev.c364
-rw-r--r--fs/bcachefs/checksum.c182
-rw-r--r--fs/bcachefs/checksum.h7
-rw-r--r--fs/bcachefs/clock.c95
-rw-r--r--fs/bcachefs/clock.h18
-rw-r--r--fs/bcachefs/clock_types.h5
-rw-r--r--fs/bcachefs/compress.c137
-rw-r--r--fs/bcachefs/compress.h12
-rw-r--r--fs/bcachefs/darray.c20
-rw-r--r--fs/bcachefs/darray.h28
-rw-r--r--fs/bcachefs/data_update.c387
-rw-r--r--fs/bcachefs/data_update.h8
-rw-r--r--fs/bcachefs/debug.c317
-rw-r--r--fs/bcachefs/dirent.c311
-rw-r--r--fs/bcachefs/dirent.h24
-rw-r--r--fs/bcachefs/disk_accounting.c1012
-rw-r--r--fs/bcachefs/disk_accounting.h275
-rw-r--r--fs/bcachefs/disk_accounting_format.h167
-rw-r--r--fs/bcachefs/disk_accounting_types.h19
-rw-r--r--fs/bcachefs/disk_groups.c13
-rw-r--r--fs/bcachefs/disk_groups_format.h21
-rw-r--r--fs/bcachefs/ec.c1284
-rw-r--r--fs/bcachefs/ec.h28
-rw-r--r--fs/bcachefs/ec_format.h26
-rw-r--r--fs/bcachefs/ec_types.h3
-rw-r--r--fs/bcachefs/errcode.c15
-rw-r--r--fs/bcachefs/errcode.h61
-rw-r--r--fs/bcachefs/error.c325
-rw-r--r--fs/bcachefs/error.h127
-rw-r--r--fs/bcachefs/extent_update.c6
-rw-r--r--fs/bcachefs/extents.c725
-rw-r--r--fs/bcachefs/extents.h141
-rw-r--r--fs/bcachefs/extents_format.h15
-rw-r--r--fs/bcachefs/eytzinger.c305
-rw-r--r--fs/bcachefs/eytzinger.h98
-rw-r--r--fs/bcachefs/fifo.h4
-rw-r--r--fs/bcachefs/fs-common.c215
-rw-r--r--fs/bcachefs/fs-common.h4
-rw-r--r--fs/bcachefs/fs-io-buffered.c167
-rw-r--r--fs/bcachefs/fs-io-buffered.h6
-rw-r--r--fs/bcachefs/fs-io-direct.c41
-rw-r--r--fs/bcachefs/fs-io-pagecache.c161
-rw-r--r--fs/bcachefs/fs-io-pagecache.h20
-rw-r--r--fs/bcachefs/fs-io.c282
-rw-r--r--fs/bcachefs/fs-ioctl.c111
-rw-r--r--fs/bcachefs/fs.c1058
-rw-r--r--fs/bcachefs/fs.h23
-rw-r--r--fs/bcachefs/fsck.c2594
-rw-r--r--fs/bcachefs/fsck.h13
-rw-r--r--fs/bcachefs/inode.c692
-rw-r--r--fs/bcachefs/inode.h125
-rw-r--r--fs/bcachefs/inode_format.h24
-rw-r--r--fs/bcachefs/io_misc.c93
-rw-r--r--fs/bcachefs/io_read.c474
-rw-r--r--fs/bcachefs/io_read.h28
-rw-r--r--fs/bcachefs/io_write.c297
-rw-r--r--fs/bcachefs/io_write.h4
-rw-r--r--fs/bcachefs/io_write_types.h1
-rw-r--r--fs/bcachefs/journal.c750
-rw-r--r--fs/bcachefs/journal.h44
-rw-r--r--fs/bcachefs/journal_io.c867
-rw-r--r--fs/bcachefs/journal_io.h52
-rw-r--r--fs/bcachefs/journal_reclaim.c230
-rw-r--r--fs/bcachefs/journal_reclaim.h3
-rw-r--r--fs/bcachefs/journal_sb.c25
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c151
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h2
-rw-r--r--fs/bcachefs/journal_seq_blacklist_format.h15
-rw-r--r--fs/bcachefs/journal_types.h68
-rw-r--r--fs/bcachefs/logged_ops.c45
-rw-r--r--fs/bcachefs/logged_ops.h2
-rw-r--r--fs/bcachefs/logged_ops_format.h5
-rw-r--r--fs/bcachefs/lru.c95
-rw-r--r--fs/bcachefs/lru.h23
-rw-r--r--fs/bcachefs/lru_format.h25
-rw-r--r--fs/bcachefs/mean_and_variance.c28
-rw-r--r--fs/bcachefs/mean_and_variance.h20
-rw-r--r--fs/bcachefs/mean_and_variance_test.c109
-rw-r--r--fs/bcachefs/migrate.c16
-rw-r--r--fs/bcachefs/move.c303
-rw-r--r--fs/bcachefs/move.h5
-rw-r--r--fs/bcachefs/movinggc.c79
-rw-r--r--fs/bcachefs/opts.c274
-rw-r--r--fs/bcachefs/opts.h167
-rw-r--r--fs/bcachefs/printbuf.c247
-rw-r--r--fs/bcachefs/printbuf.h69
-rw-r--r--fs/bcachefs/quota.c139
-rw-r--r--fs/bcachefs/quota.h7
-rw-r--r--fs/bcachefs/rcu_pending.c666
-rw-r--r--fs/bcachefs/rcu_pending.h27
-rw-r--r--fs/bcachefs/rebalance.c281
-rw-r--r--fs/bcachefs/rebalance.h30
-rw-r--r--fs/bcachefs/rebalance_format.h53
-rw-r--r--fs/bcachefs/rebalance_types.h2
-rw-r--r--fs/bcachefs/recovery.c882
-rw-r--r--fs/bcachefs/recovery.h32
-rw-r--r--fs/bcachefs/recovery_passes.c316
-rw-r--r--fs/bcachefs/recovery_passes.h18
-rw-r--r--fs/bcachefs/recovery_passes_types.h80
-rw-r--r--fs/bcachefs/recovery_types.h66
-rw-r--r--fs/bcachefs/reflink.c562
-rw-r--r--fs/bcachefs/reflink.h44
-rw-r--r--fs/bcachefs/reflink_format.h7
-rw-r--r--fs/bcachefs/replicas.c378
-rw-r--r--fs/bcachefs/replicas.h18
-rw-r--r--fs/bcachefs/replicas_format.h36
-rw-r--r--fs/bcachefs/replicas_types.h16
-rw-r--r--fs/bcachefs/sb-clean.c122
-rw-r--r--fs/bcachefs/sb-counters.c20
-rw-r--r--fs/bcachefs/sb-counters_format.h165
-rw-r--r--fs/bcachefs/sb-downgrade.c229
-rw-r--r--fs/bcachefs/sb-downgrade.h1
-rw-r--r--fs/bcachefs/sb-downgrade_format.h17
-rw-r--r--fs/bcachefs/sb-errors.c22
-rw-r--r--fs/bcachefs/sb-errors.h2
-rw-r--r--fs/bcachefs/sb-errors_format.h336
-rw-r--r--fs/bcachefs/sb-errors_types.h256
-rw-r--r--fs/bcachefs/sb-members.c262
-rw-r--r--fs/bcachefs/sb-members.h198
-rw-r--r--fs/bcachefs/sb-members_format.h121
-rw-r--r--fs/bcachefs/sb-members_types.h21
-rw-r--r--fs/bcachefs/seqmutex.h11
-rw-r--r--fs/bcachefs/siphash.c2
-rw-r--r--fs/bcachefs/six.c46
-rw-r--r--fs/bcachefs/six.h8
-rw-r--r--fs/bcachefs/snapshot.c1005
-rw-r--r--fs/bcachefs/snapshot.h130
-rw-r--r--fs/bcachefs/str_hash.c295
-rw-r--r--fs/bcachefs/str_hash.h149
-rw-r--r--fs/bcachefs/subvolume.c378
-rw-r--r--fs/bcachefs/subvolume.h69
-rw-r--r--fs/bcachefs/subvolume_format.h4
-rw-r--r--fs/bcachefs/subvolume_types.h7
-rw-r--r--fs/bcachefs/super-io.c363
-rw-r--r--fs/bcachefs/super-io.h21
-rw-r--r--fs/bcachefs/super.c510
-rw-r--r--fs/bcachefs/super.h11
-rw-r--r--fs/bcachefs/super_types.h15
-rw-r--r--fs/bcachefs/sysfs.c418
-rw-r--r--fs/bcachefs/tests.c67
-rw-r--r--fs/bcachefs/thread_with_file.c431
-rw-r--r--fs/bcachefs/thread_with_file.h66
-rw-r--r--fs/bcachefs/thread_with_file_types.h18
-rw-r--r--fs/bcachefs/time_stats.c179
-rw-r--r--fs/bcachefs/time_stats.h160
-rw-r--r--fs/bcachefs/trace.c1
-rw-r--r--fs/bcachefs/trace.h783
-rw-r--r--fs/bcachefs/two_state_shared_lock.h11
-rw-r--r--fs/bcachefs/util.c499
-rw-r--r--fs/bcachefs/util.h344
-rw-r--r--fs/bcachefs/varint.c9
-rw-r--r--fs/bcachefs/xattr.c175
-rw-r--r--fs/bcachefs/xattr.h8
-rw-r--r--fs/bcachefs/xattr_format.h2
214 files changed, 28364 insertions, 17231 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5cdfef3b551a..fc7efd0a7525 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -15,6 +15,7 @@ config BCACHEFS_FS
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select CRYPTO
select CRYPTO_SHA256
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
@@ -24,6 +25,7 @@ config BCACHEFS_FS
select XXHASH
select SRCU
select SYMBOLIC_ERRNAME
+ select MIN_HEAP
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -59,6 +61,13 @@ config BCACHEFS_DEBUG
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
+config BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ bool "Randomly inject transaction restarts"
+ depends on BCACHEFS_DEBUG
+ help
+ Randomly inject transaction restarts in a few core paths - may have a
+ significant performance penalty
+
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
@@ -87,6 +96,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
is held by another thread, spin for a short while, as long as the
thread owning the lock is running.
+config BCACHEFS_PATH_TRACEPOINTS
+ bool "Extra btree_path tracepoints"
+ depends on BCACHEFS_FS && TRACING
+ help
+ Enable extra tracepoints for debugging btree_path operations; we don't
+ normally want these enabled because they happen at very high rates.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..d2689388d5e8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \
btree_key_cache.o \
btree_locking.o \
+ btree_node_scan.o \
btree_trans_commit.o \
btree_update.o \
btree_update_interior.o \
@@ -28,15 +29,17 @@ bcachefs-y := \
clock.o \
compress.o \
darray.o \
+ data_update.o \
debug.o \
dirent.o \
+ disk_accounting.o \
disk_groups.o \
- data_update.o \
ec.o \
errcode.o \
error.o \
extents.o \
extent_update.o \
+ eytzinger.o \
fs.o \
fs-common.o \
fs-ioctl.o \
@@ -66,7 +69,9 @@ bcachefs-y := \
printbuf.o \
quota.o \
rebalance.o \
+ rcu_pending.o \
recovery.o \
+ recovery_passes.o \
reflink.o \
replicas.o \
sb-clean.o \
@@ -77,11 +82,13 @@ bcachefs-y := \
siphash.o \
six.o \
snapshot.o \
+ str_hash.o \
subvolume.o \
super.o \
super-io.o \
sysfs.o \
tests.o \
+ time_stats.o \
thread_with_file.o \
trace.o \
two_state_shared_lock.o \
@@ -90,3 +97,6 @@ bcachefs-y := \
xattr.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..99487727ae64 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -184,11 +184,6 @@ invalid:
return ERR_PTR(-EINVAL);
}
-#define acl_for_each_entry(acl, acl_e) \
- for (acl_e = acl->a_entries; \
- acl_e < acl->a_entries + acl->a_count; \
- acl_e++)
-
/*
* Convert from in-memory to filesystem representation.
*/
@@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans,
{
struct bkey_i_xattr *xattr;
bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e;
+ const struct posix_acl_entry *acl_e, *pe;
void *outptr;
unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
switch (acl_e->e_tag) {
case ACL_USER:
case ACL_GROUP:
@@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
outptr = (void *) acl_header + sizeof(*acl_header);
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
bch_acl_entry *entry = outptr;
entry->e_tag = cpu_to_le16(acl_e->e_tag);
@@ -272,46 +267,41 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
- struct dentry *dentry, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
- if (!IS_ERR(acl))
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
@@ -354,7 +344,6 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
@@ -362,13 +351,14 @@ int bch2_set_acl(struct mnt_idmap *idmap,
int ret;
mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
acl = _acl;
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -402,8 +392,8 @@ btree_err:
set_cached_acl(&inode->v, type, acl);
err:
- mutex_unlock(&inode->ei_update_lock);
bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
@@ -416,39 +406,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_i_xattr *new;
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
return bch2_err_matches(ret, ENOENT) ? 0 : ret;
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
- if (IS_ERR_OR_NULL(acl))
+ if (ret)
goto err;
- ret = allocate_dropping_locks_errcode(trans,
- __posix_acl_chmod(&acl, _gfp, mode));
+ ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
if (ret)
goto err;
- new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- if (IS_ERR(new)) {
- ret = PTR_ERR(new);
+ struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
goto err;
- }
new->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index 27e7eec0f278..fe730a6bf0c1 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t);
#ifdef CONFIG_BCACHEFS_POSIX_ACL
-struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fd3e175d8342..3ea809990ef1 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
@@ -14,6 +15,7 @@
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
@@ -28,6 +30,9 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
+#include <linux/jiffies.h>
+
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
/* Persistent alloc info: */
@@ -192,99 +197,119 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
int ret = 0;
/* allow for unknown fields */
- bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
- alloc_v1_val_size_bad,
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
+ c, alloc_v1_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
return ret;
}
-int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_alloc_unpacked u;
int ret = 0;
- bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
- alloc_v2_unpack_error,
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
+ c, alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
-int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_alloc_unpacked u;
int ret = 0;
- bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
- alloc_v2_unpack_error,
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
+ c, alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
-int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ struct bch_alloc_v4 a;
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
- alloc_v4_val_size_bad,
+ bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
+
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
+ c, alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
- alloc_v4_backpointers_start_bad,
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
+ c, alloc_v4_backpointers_start_bad,
"invalid backpointers_start");
- bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
- alloc_key_data_type_bad,
+ bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
+ c, alloc_key_data_type_bad,
"invalid data type (got %u should be %u)",
- a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ a.data_type, alloc_data_type(a, a.data_type));
+
+ for (unsigned i = 0; i < 2; i++)
+ bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
+ c, alloc_key_io_time_bad,
+ "invalid io_time[%s]: %llu, max %llu",
+ i == READ ? "read" : "write",
+ a.io_time[i], LRU_TIME_MAX);
+
+ unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
+ offsetof(struct bch_alloc_v4, stripe_sectors)
+ ? a.stripe_sectors
+ : 0;
- switch (a.v->data_type) {
+ switch (a.data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
- c, err, alloc_key_empty_but_have_data,
- "empty data type free but have data");
+ bkey_fsck_err_on(stripe_sectors ||
+ a.dirty_sectors ||
+ a.cached_sectors ||
+ a.stripe,
+ c, alloc_key_empty_but_have_data,
+ "empty data type free but have data %u.%u.%u %u",
+ stripe_sectors,
+ a.dirty_sectors,
+ a.cached_sectors,
+ a.stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
- c, err, alloc_key_dirty_sectors_0,
+ bkey_fsck_err_on(!a.dirty_sectors &&
+ !stripe_sectors,
+ c, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.v->data_type));
+ bch2_data_type_str(a.data_type));
break;
case BCH_DATA_cached:
- bkey_fsck_err_on(!a.v->cached_sectors ||
- bch2_bucket_sectors_dirty(*a.v) ||
- a.v->stripe,
- c, err, alloc_key_cached_inconsistency,
+ bkey_fsck_err_on(!a.cached_sectors ||
+ a.dirty_sectors ||
+ stripe_sectors ||
+ a.stripe,
+ c, alloc_key_cached_inconsistency,
"data type inconsistency");
- bkey_fsck_err_on(!a.v->io_time[READ] &&
+ bkey_fsck_err_on(!a.io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, err, alloc_key_cached_but_read_time_zero,
+ c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
@@ -297,9 +322,9 @@ fsck_err:
void bch2_alloc_v4_swab(struct bkey_s k)
{
struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
- struct bch_backpointer *bp, *bps;
- a->journal_seq = swab64(a->journal_seq);
+ a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
+ a->journal_seq_empty = swab64(a->journal_seq_empty);
a->flags = swab32(a->flags);
a->dirty_sectors = swab32(a->dirty_sectors);
a->cached_sectors = swab32(a->cached_sectors);
@@ -307,20 +332,14 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
- a->fragmentation_lru = swab64(a->fragmentation_lru);
-
- bps = alloc_v4_backpointers(a);
- for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
- bp->bucket_offset = swab40(bp->bucket_offset);
- bp->bucket_len = swab32(bp->bucket_len);
- bch2_bpos_swab(&bp->pos);
- }
+ a->stripe_sectors = swab32(a->stripe_sectors);
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -328,28 +347,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
- prt_printf(out, "journal_seq %llu", a->journal_seq);
- prt_newline(out);
- prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_newline(out);
- prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_newline(out);
- prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
- prt_newline(out);
- prt_printf(out, "cached_sectors %u", a->cached_sectors);
- prt_newline(out);
- prt_printf(out, "stripe %u", a->stripe);
- prt_newline(out);
- prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
- prt_newline(out);
- prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
- prt_newline(out);
- prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
- prt_newline(out);
- prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
- prt_newline(out);
- prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
+ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+
+ if (ca)
+ prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
+ prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
printbuf_indent_sub(out, 2);
+
+ bch2_dev_put(ca);
}
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -371,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
*out = (struct bch_alloc_v4) {
- .journal_seq = u.journal_seq,
+ .journal_seq_nonempty = u.journal_seq,
.flags = u.need_discard,
.gen = u.gen,
.oldest_gen = u.oldest_gen,
@@ -437,22 +452,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- int ret;
-
- k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ERR_PTR(ret);
- a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
ret = PTR_ERR_OR_ZERO(a);
if (unlikely(ret))
goto err;
@@ -462,6 +473,21 @@ err:
return ERR_PTR(ret);
}
+__flatten
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return unlikely(ret) ? ERR_PTR(ret) : a;
+}
+
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
@@ -484,14 +510,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
: 0;
}
-int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
- bucket_gens_val_size_bad,
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
+ c, bucket_gens_val_size_bad,
"bad val size (%zu != %zu)",
bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
@@ -518,7 +543,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
int ret;
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -530,13 +555,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
u8 gen = bch2_alloc_to_v4(k, &a)->gen;
unsigned offset;
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+ int ret2 = 0;
- if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
- if (ret)
- break;
+ if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
+ ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret2)
+ goto iter_err;
have_bucket_gens_key = false;
}
@@ -547,7 +572,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- 0;
+iter_err:
+ ret2;
}));
if (have_bucket_gens_key && !ret)
@@ -564,29 +590,29 @@ int bch2_bucket_gens_init(struct bch_fs *c)
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_dev *ca = NULL;
int ret;
- down_read(&c->gc_lock);
-
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_exists2(c, k.k->p.inode))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
continue;
+ }
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
@@ -596,15 +622,26 @@ int bch2_alloc_read(struct bch_fs *c)
}));
} else {
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_bucket_exists(c, k.k->p))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
+ if (k.k->p.offset < ca->mi.first_bucket) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
continue;
+ }
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ if (k.k->p.offset >= ca->mi.nbuckets) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
@@ -612,8 +649,8 @@ int bch2_alloc_read(struct bch_fs *c)
}));
}
+ bch2_dev_put(ca);
bch2_trans_put(trans);
- up_read(&c->gc_lock);
bch_err_fn(c, ret);
return ret;
@@ -621,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c)
/* Free space/discard btree: */
+static int __need_discard_or_freespace_err(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ bool set, bool discard, bool repair)
+{
+ struct bch_fs *c = trans->c;
+ enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
+ enum bch_sb_error_id err_id = discard
+ ? BCH_FSCK_ERR_need_discard_key_wrong
+ : BCH_FSCK_ERR_freespace_key_wrong;
+ enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, alloc_k);
+
+ int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
+ "bucket incorrectly %sset in %s btree\n"
+ " %s",
+ set ? "" : "un",
+ bch2_btree_id_str(btree),
+ buf.buf);
+ if (ret == -BCH_ERR_fsck_ignore ||
+ ret == -BCH_ERR_fsck_errors_not_fixed)
+ ret = 0;
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+#define need_discard_or_freespace_err(...) \
+ fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
+
+#define need_discard_or_freespace_err_on(cond, ...) \
+ (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
+
static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c alloc_k,
const struct bch_alloc_v4 *a,
bool set)
{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
- struct btree_iter iter;
- struct bkey_s_c old;
- struct bkey_i *k;
enum btree_id btree;
- enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
- enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- struct printbuf buf = PRINTBUF;
- int ret;
+ struct bpos pos;
if (a->data_type != BCH_DATA_free &&
a->data_type != BCH_DATA_need_discard)
return 0;
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.type = new_type;
-
switch (a->data_type) {
case BCH_DATA_free:
btree = BTREE_ID_freespace;
- k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
- bch2_key_resize(&k->k, 1);
+ pos = alloc_freespace_pos(alloc_k.k->p, *a);
break;
case BCH_DATA_need_discard:
btree = BTREE_ID_need_discard;
- k->k.p = alloc_k.k->p;
+ pos = alloc_k.k->p;
break;
default:
return 0;
}
- old = bch2_bkey_get_iter(trans, &iter, btree,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
- ret = bkey_err(old);
+ struct btree_iter iter;
+ struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
+ int ret = bkey_err(old);
if (ret)
return ret;
- if (ca->mi.freespace_initialized &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
- bch2_trans_inconsistent_on(old.k->type != old_type, trans,
- "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
- " for %s",
- set ? "setting" : "clearing",
- bch2_btree_id_str(btree),
- iter.pos.inode,
- iter.pos.offset,
- bch2_bkey_types[old.k->type],
- bch2_bkey_types[old_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = -EIO;
- goto err;
- }
+ need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
+ !old.k->type != set,
+ trans, alloc_k, set,
+ btree == BTREE_ID_need_discard, false);
- ret = bch2_trans_update(trans, &iter, k, 0);
-err:
+ ret = bch2_btree_bit_mod_iter(trans, &iter, set);
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
return ret;
}
@@ -708,8 +751,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
ret = bkey_err(k);
if (ret)
return ret;
@@ -728,31 +771,99 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
+static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_data_type data_type,
+ s64 delta_buckets,
+ s64 delta_sectors,
+ s64 delta_fragmented, unsigned flags)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = data_type,
+ };
+ s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
+
+ return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+}
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ unsigned flags)
+{
+ s64 old_sectors = bch2_bucket_sectors(*old);
+ s64 new_sectors = bch2_bucket_sectors(*new);
+ if (old->data_type != new->data_type) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
+ bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
+ -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ } else if (old_sectors != new_sectors) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 0,
+ new_sectors - old_sectors,
+ bch2_bucket_sectors_fragmented(ca, *new) -
+ bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ }
+
+ s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
+ s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
+ if (old_unstriped != new_unstriped) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
+ !!new_unstriped - !!old_unstriped,
+ new_unstriped - old_unstriped,
+ 0,
+ flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
+ if (!ca)
return -EIO;
- struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
+ struct bch_alloc_v4 *new_a;
+ if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
+ new_a = bkey_s_to_alloc_v4(new).v;
+ } else {
+ BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
+
+ struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
+ ret = PTR_ERR_OR_ZERO(new_ka);
+ if (unlikely(ret))
+ goto err;
+ new_a = &new_ka->v;
+ }
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ alloc_data_type_set(new_a, new_a->data_type);
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
+ (int) data_type_is_empty(old_a->data_type);
- if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ if (is_empty_delta < 0) {
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
+ new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
@@ -762,20 +873,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ alloc_data_type_set(new_a, new_a->data_type);
}
if (old_a->data_type != new_a->data_type ||
(new_a->data_type == BCH_DATA_free &&
alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
if (ret)
- return ret;
+ goto err;
}
if (new_a->data_type == BCH_DATA_cached &&
!new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
u64 old_lru = alloc_lru_idx_read(*old_a);
u64 new_lru = alloc_lru_idx_read(*new_a);
@@ -784,128 +896,150 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_to_u64(new.k->p),
old_lru, new_lru);
if (ret)
- return ret;
+ goto err;
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new.k->p.inode));
- if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
+ new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
+ if (old_lru != new_lru) {
ret = bch2_lru_change(trans,
BCH_LRU_FRAGMENTATION_START,
bucket_to_u64(new.k->p),
- old_a->fragmentation_lru, new_a->fragmentation_lru);
+ old_lru, new_lru);
if (ret)
- return ret;
+ goto err;
}
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
if (ret)
- return ret;
+ goto err;
}
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
old_a->cached_sectors) {
- ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
- -((s64) old_a->cached_sectors));
+ ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
+ -((s64) old_a->cached_sectors),
+ flags & BTREE_TRIGGER_gc);
if (ret)
- return ret;
+ goto err;
}
+
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
+ if (ret)
+ goto err;
}
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
- struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- u64 journal_seq = trans->journal_res.seq;
- u64 bucket_journal_seq = new_a->journal_seq;
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
+ u64 transaction_seq = trans->journal_res.seq;
+ BUG_ON(!transaction_seq);
- if ((flags & BTREE_TRIGGER_INSERT) &&
- data_type_is_empty(old_a->data_type) !=
- data_type_is_empty(new_a->data_type) &&
- new.k->type == KEY_TYPE_alloc_v4) {
- struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
+ if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
+ trans, alloc_key_journal_seq_in_future,
+ "bucket journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
+ new_a->journal_seq_nonempty = transaction_seq;
- /*
- * If the btree updates referring to a bucket weren't flushed
- * before the bucket became empty again, then the we don't have
- * to wait on a journal flush before we can reuse the bucket:
- */
- v->journal_seq = bucket_journal_seq =
- data_type_is_empty(new_a->data_type) &&
- (journal_seq == v->journal_seq ||
- bch2_journal_noflush_seq(&c->journal, v->journal_seq))
- ? 0 : journal_seq;
+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
+ (int) data_type_is_empty(old_a->data_type);
+
+ /*
+ * Record journal sequence number of empty -> nonempty transition:
+ * Note that there may be multiple empty -> nonempty
+ * transitions, data in a bucket may be overwritten while we're
+ * still writing to it - so be careful to only record the first:
+ * */
+ if (is_empty_delta < 0 &&
+ new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
+ new_a->journal_seq_nonempty = transaction_seq;
+ new_a->journal_seq_empty = 0;
}
- if (!data_type_is_empty(old_a->data_type) &&
- data_type_is_empty(new_a->data_type) &&
- bucket_journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- bucket_journal_seq);
- if (ret) {
- bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
- return ret;
+ /*
+ * Bucket becomes empty: mark it as waiting for a journal flush,
+ * unless updates since empty -> nonempty transition were never
+ * flushed - we may need to ask the journal not to flush
+ * intermediate sequence numbers:
+ */
+ if (is_empty_delta > 0) {
+ if (new_a->journal_seq_nonempty == transaction_seq ||
+ bch2_journal_noflush_seq(&c->journal,
+ new_a->journal_seq_nonempty,
+ transaction_seq)) {
+ new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
+ } else {
+ new_a->journal_seq_empty = transaction_seq;
+
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ transaction_seq);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "setting bucket_needs_journal_commit: %s",
+ bch2_err_str(ret)))
+ goto err;
}
}
- percpu_down_read(&c->mark_lock);
- if (new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+ if (new_a->gen != old_a->gen) {
+ rcu_read_lock();
+ u8 *gen = bucket_gen(ca, new.k->p.offset);
+ if (unlikely(!gen)) {
+ rcu_read_unlock();
+ goto invalid_bucket;
+ }
+ *gen = new_a->gen;
+ rcu_read_unlock();
+ }
- bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ if (statechange(a->data_type == BCH_DATA_free) &&
+ bucket_flushed(new_a))
closure_wake_up(&c->freelist_wait);
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
+ if (statechange(a->data_type == BCH_DATA_need_discard) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
+ bucket_flushed(new_a))
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
+ if (statechange(a->data_type == BCH_DATA_cached) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
- if (new_a->data_type == BCH_DATA_need_gc_gens)
- bch2_do_gc_gens(c);
- percpu_up_read(&c->mark_lock);
+ if (statechange(a->data_type == BCH_DATA_need_gc_gens))
+ bch2_gc_gens_async(c);
}
- if ((flags & BTREE_TRIGGER_GC) &&
- (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
- struct bch_alloc_v4 new_a_convert;
- const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
-
- percpu_down_read(&c->mark_lock);
+ if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
+ rcu_read_lock();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
- bucket_lock(g);
-
- g->gen_valid = 1;
- g->gen = new_a->gen;
- g->data_type = new_a->data_type;
- g->stripe = new_a->stripe;
- g->stripe_redundancy = new_a->stripe_redundancy;
- g->dirty_sectors = new_a->dirty_sectors;
- g->cached_sectors = new_a->cached_sectors;
-
- bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
+ if (unlikely(!g)) {
+ rcu_read_unlock();
+ goto invalid_bucket;
+ }
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ rcu_read_unlock();
}
-
- return 0;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch2_dev_put(ca);
+ return ret;
+invalid_bucket:
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
+ ret = -EIO;
+ goto err;
}
/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
@@ -933,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
- k = bch2_btree_iter_peek_upto(&iter2, end);
+ k = bch2_btree_iter_peek_max(&iter2, end);
next = iter2.pos;
bch2_trans_iter_exit(iter->trans, &iter2);
@@ -950,35 +1084,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
}
}
-static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
- struct bch_dev *ca;
-
- if (bch2_dev_bucket_exists(c, *bucket))
- return true;
+ if (*ca) {
+ if (bucket->offset < (*ca)->mi.first_bucket)
+ bucket->offset = (*ca)->mi.first_bucket;
- if (bch2_dev_exists2(c, bucket->inode)) {
- ca = bch_dev_bkey_exists(c, bucket->inode);
-
- if (bucket->offset < ca->mi.first_bucket) {
- bucket->offset = ca->mi.first_bucket;
+ if (bucket->offset < (*ca)->mi.nbuckets)
return true;
- }
+ bch2_dev_put(*ca);
+ *ca = NULL;
bucket->inode++;
bucket->offset = 0;
}
rcu_read_lock();
- ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (ca)
- *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
+ if (*ca) {
+ *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
+ bch2_dev_get(*ca);
+ }
rcu_read_unlock();
- return ca != NULL;
+ return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
struct bch_fs *c = iter->trans->c;
struct bkey_s_c k;
@@ -987,22 +1120,21 @@ again:
if (bkey_err(k))
return k;
+ *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
+
if (!k.k->type) {
- struct bpos bucket = bkey_start_pos(k.k);
+ struct bpos hole_start = bkey_start_pos(k.k);
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (!next_bucket(c, &bucket))
+ if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
+ if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bucket);
+ bch2_btree_iter_set_pos(iter, hole_start);
goto again;
}
- if (!bch2_dev_bucket_exists(c, k.k->p)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
-
- bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
- }
+ if (k.k->p.offset > (*ca)->mi.nbuckets)
+ bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
}
return k;
@@ -1017,87 +1149,51 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct btree_iter *bucket_gens_iter)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
- unsigned discard_key_type, freespace_key_type;
unsigned gens_offset;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
- int ret;
+ int ret = 0;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
- alloc_key_to_missing_dev_bucket,
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
+ if (fsck_err_on(!ca,
+ trans, alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
- return bch2_btree_delete_at(trans, alloc_iter, 0);
+ ret = bch2_btree_delete_at(trans, alloc_iter, 0);
+ if (!ca)
+ return ret;
- ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
if (!ca->mi.freespace_initialized)
- return 0;
+ goto out;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
k = bch2_btree_iter_peek_slot(discard_iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != discard_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, need_discard_key_wrong,
- "incorrect key in need_discard btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[discard_key_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = discard_key_type;
- update->k.p = discard_iter->pos;
-
- ret = bch2_trans_update(trans, discard_iter, update, 0);
+ bool is_discarded = a->data_type == BCH_DATA_need_discard;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
+ trans, alloc_k, !is_discarded, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
if (ret)
goto err;
}
- freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
k = bch2_btree_iter_peek_slot(freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != freespace_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_key_wrong,
- "incorrect key in freespace btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[freespace_key_type],
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = freespace_key_type;
- update->k.p = freespace_iter->pos;
- bch2_key_resize(&update->k, 1);
-
- ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ bool is_free = a->data_type == BCH_DATA_free;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
+ trans, alloc_k, !is_free, false, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
if (ret)
goto err;
}
@@ -1108,14 +1204,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (a->gen != alloc_gen(k, gens_offset) &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+ trans, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@@ -1136,25 +1231,25 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
}
+out:
err:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos start,
struct bpos *end,
struct btree_iter *freespace_iter)
{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
- ca = bch_dev_bkey_exists(c, start.inode);
if (!ca->mi.freespace_initialized)
return 0;
@@ -1167,14 +1262,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
- if (k.k->type != KEY_TYPE_set &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset))) {
+ if (fsck_err_on(k.k->type != KEY_TYPE_set,
+ trans, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1205,7 +1299,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
struct bpos *end,
struct btree_iter *bucket_gens_iter)
{
- struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
unsigned i, gens_offset, gens_end_offset;
@@ -1229,7 +1322,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
bkey_reassemble(&g.k_i, k);
for (i = gens_offset; i < gens_end_offset; i++) {
- if (fsck_err_on(g.v.gens[i], c,
+ if (fsck_err_on(g.v.gens[i], trans,
bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
@@ -1262,63 +1355,129 @@ fsck_err:
return ret;
}
-static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter)
+struct check_discard_freespace_key_async {
+ struct work_struct work;
+ struct bch_fs *c;
+ struct bbpos pos;
+};
+
+static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ u8 gen;
+ ret = k.k->type != KEY_TYPE_set
+ ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
+ : 0;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void check_discard_freespace_key_work(struct work_struct *work)
+{
+ struct check_discard_freespace_key_async *w =
+ container_of(work, struct check_discard_freespace_key_async, work);
+
+ bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
+ bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
+ kfree(w);
+}
+
+int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
+ bool async_repair)
{
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter;
- struct bkey_s_c alloc_k;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- u64 genbits;
- struct bpos pos;
enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
? BCH_DATA_need_discard
: BCH_DATA_free;
struct printbuf buf = PRINTBUF;
- int ret;
- pos = iter->pos;
- pos.offset &= ~(~0ULL << 56);
- genbits = iter->pos.offset & (~0ULL << 56);
+ struct bpos bucket = iter->pos;
+ bucket.offset &= ~(~0ULL << 56);
+ u64 genbits = iter->pos.offset & (~0ULL << 56);
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
- ret = bkey_err(alloc_k);
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ async_repair ? BTREE_ITER_cached : 0);
+ int ret = bkey_err(alloc_k);
if (ret)
return ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
- need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
- goto delete;
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a))) {
+ if (fsck_err(trans, need_discard_freespace_key_bad,
+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_id_str(iter->btree_id),
+ iter->pos.inode,
+ iter->pos.offset,
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
- if (fsck_err_on(a->data_type != state ||
- (state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a)), c,
- need_discard_freespace_key_bad,
- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_id_str(iter->btree_id),
- iter->pos.inode,
- iter->pos.offset,
- a->data_type == state,
- genbits >> 56, alloc_freespace_genbits(*a) >> 56))
- goto delete;
+ *gen = a->gen;
out:
fsck_err:
- set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(&alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
delete:
- ret = bch2_btree_delete_extent_at(trans, iter,
- iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- goto out;
+ if (!async_repair) {
+ ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ goto out;
+ } else {
+ /*
+ * We can't repair here when called from the allocator path: the
+ * commit will recurse back into the allocator
+ */
+ struct check_discard_freespace_key_async *w =
+ kzalloc(sizeof(*w), GFP_KERNEL);
+ if (!w)
+ goto out;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
+ kfree(w);
+ goto out;
+ }
+
+ INIT_WORK(&w->work, check_discard_freespace_key_work);
+ w->c = c;
+ w->pos = BBPOS(iter->btree_id, iter->pos);
+ queue_work(c->write_ref_wq, &w->work);
+ goto out;
+ }
+}
+
+static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
+{
+ u8 gen;
+ int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
+ return ret < 0 ? ret : 0;
}
/*
@@ -1333,33 +1492,28 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_bucket_gens g;
- struct bch_dev *ca;
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
u64 b;
- bool need_update = false, dev_exists;
+ bool need_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
bkey_reassemble(&g.k_i, k);
- /* if no bch_dev, skip out whether we repair or not */
- dev_exists = bch2_dev_exists2(c, k.k->p.inode);
- if (!dev_exists) {
- if (fsck_err_on(!dev_exists, c,
- bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
+ if (!ca) {
+ if (fsck_err(trans, bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
- }
goto out;
}
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
- start >= ca->mi.nbuckets, c,
- bucket_gens_to_invalid_buckets,
+ start >= ca->mi.nbuckets,
+ trans, bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1367,16 +1521,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
for (b = start; b < ca->mi.first_bucket; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
- bucket_gens_nonzero_for_invalid_buckets,
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
}
for (b = ca->mi.nbuckets; b < end; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
- bucket_gens_nonzero_for_invalid_buckets,
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@@ -1394,6 +1548,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
out:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
@@ -1402,25 +1557,26 @@ int bch2_check_alloc_info(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bch_dev *ca = NULL;
struct bkey hole;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
struct bpos next;
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1441,7 +1597,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
} else {
next = k.k->p;
- ret = bch2_check_alloc_hole_freespace(trans,
+ ret = bch2_check_alloc_hole_freespace(trans, ca,
bkey_start_pos(k.k),
&next,
&freespace_iter) ?:
@@ -1469,19 +1625,21 @@ bkey_err:
bch2_trans_iter_exit(trans, &freespace_iter);
bch2_trans_iter_exit(trans, &discard_iter);
bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
+ ca = NULL;
if (ret < 0)
goto err;
ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter));
+ BTREE_ITER_prefetch, k,
+ bch2_check_discard_freespace_key_fsck(trans, &iter));
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -1489,7 +1647,7 @@ bkey_err:
break;
ret = bkey_err(k) ?:
- bch2_check_discard_freespace_key(trans, &iter);
+ bch2_check_discard_freespace_key_fsck(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
@@ -1511,7 +1669,7 @@ bkey_err:
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
@@ -1521,13 +1679,13 @@ err:
}
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter)
+ struct btree_iter *alloc_iter,
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter lru_iter;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k, lru_k;
+ struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret;
@@ -1539,13 +1697,25 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
return ret;
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
+ if (!ca)
+ return 0;
+
a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+ if (lru_idx) {
+ ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ lru_idx, alloc_k, last_flushed);
+ if (ret)
+ goto err;
+ }
+
if (a->data_type != BCH_DATA_cached)
- return 0;
+ goto err;
- if (fsck_err_on(!a->io_time[READ], c,
- alloc_key_cached_but_read_time_zero,
+ if (fsck_err_on(!a->io_time[READ],
+ trans, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
@@ -1556,119 +1726,117 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
goto err;
- a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ &a_mut->k_i, BTREE_TRIGGER_norun);
if (ret)
goto err;
a = &a_mut->v;
}
- lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]), 0);
- ret = bkey_err(lru_k);
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ alloc_k, last_flushed);
if (ret)
- return ret;
-
- if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
- alloc_key_to_missing_lru_entry,
- "missing lru entry\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = bch2_lru_set(trans,
- alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]);
- if (ret)
- goto err;
- }
+ goto err;
err:
fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_PREFETCH, k,
+ POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter)));
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
+{
+ int ret;
+
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ ret = -BCH_ERR_EEXIST_discard_in_flight_add;
+ goto out;
+ }
+
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
+out:
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+ return ret;
+}
+
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
+{
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
+ goto found;
+ }
+ BUG();
+found:
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+}
+
struct discard_buckets_state {
u64 seen;
u64 open;
u64 need_journal_commit;
u64 discarded;
- struct bch_dev *ca;
- u64 need_journal_commit_this_dev;
};
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
- if (s->ca == ca)
- return;
-
- if (s->ca && s->need_journal_commit_this_dev >
- bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (s->ca)
- percpu_ref_put(&s->ca->ref);
- if (ca)
- percpu_ref_get(&ca->ref);
- s->ca = ca;
- s->need_journal_commit_this_dev = 0;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- struct discard_buckets_state *s)
+ struct discard_buckets_state *s,
+ bool fastpath)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
struct btree_iter iter = { NULL };
struct bkey_s_c k;
- struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
+ bool discard_locked = false;
int ret = 0;
- ca = bch_dev_bkey_exists(c, pos.inode);
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
- bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
- return 0;
- }
-
- discard_buckets_next_dev(c, s, ca);
-
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
goto out;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- pos.inode, pos.offset)) {
- s->need_journal_commit++;
- s->need_journal_commit_this_dev++;
+ u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ pos.inode, pos.offset);
+ if (seq_ready > c->journal.flushed_seq_ondisk) {
+ if (seq_ready > c->journal.flushing_seq)
+ s->need_journal_commit++;
goto out;
}
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
need_discard_iter->pos,
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto out;
@@ -1678,78 +1846,73 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (a->v.data_type != BCH_DATA_need_discard) {
+ if (need_discard_or_freespace_err(trans, k, true, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
+ if (ret)
+ goto out;
+ goto commit;
}
+
goto out;
}
- if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- }
+ if (!fastpath) {
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
+ goto out;
- goto out;
+ discard_locked = true;
}
- if (!bkey_eq(*discard_pos_done, iter.pos) &&
- ca->mi.discard && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
+ if (!bkey_eq(*discard_pos_done, iter.pos)) {
+ s->discarded++;
*discard_pos_done = iter.pos;
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
+ if (ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock_long(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- a->v.data_type = alloc_data_type(a->v, a->v.data_type);
-write:
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ alloc_data_type_set(&a->v, a->v.data_type);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto out;
+commit:
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
count_event(c, bucket_discard);
- s->discarded++;
out:
- s->seen++;
+fsck_err:
+ if (discard_locked)
+ discard_in_flight_remove(ca, iter.pos.offset);
+ if (!ret)
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
- percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
return ret;
}
static void bch2_do_discards_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1760,23 +1923,136 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
+ for_each_btree_key_max(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
- discard_buckets_next_dev(c, &s, NULL);
+ if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
+ bch2_journal_flush_async(&c->journal, NULL);
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_write_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_write_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
- !queue_work(c->write_ref_wq, &c->discard_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
+}
+
+static int bch2_do_discards_fast_one(struct btree_trans *trans,
+ struct bch_dev *ca,
+ u64 bucket,
+ struct bpos *discard_pos_done,
+ struct discard_buckets_state *s)
+{
+ struct btree_iter need_discard_iter;
+ struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
+ BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
+ int ret = bkey_err(discard_k);
+ if (ret)
+ return ret;
+
+ if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
+ trans, discarding_bucket_not_in_need_discard_btree,
+ "attempting to discard bucket %u:%llu not in need_discard btree",
+ ca->dev_idx, bucket))
+ goto out;
+
+ ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &need_discard_iter);
+ return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
+ struct discard_buckets_state s = {};
+ struct bpos discard_pos_done = POS_MAX;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+
+ while (1) {
+ bool got_bucket = false;
+ u64 bucket;
+
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
+ continue;
+
+ got_bucket = true;
+ bucket = i->bucket;
+ i->in_progress = true;
+ break;
+ }
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+
+ if (!got_bucket)
+ break;
+
+ ret = lockrestart_do(trans,
+ bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
+ bch_err_fn(c, ret);
+
+ discard_in_flight_remove(ca, bucket);
+
+ if (ret)
+ break;
+ }
+
+ trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+
+ bch2_trans_put(trans);
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -1785,7 +2061,6 @@ static int invalidate_one_bucket(struct btree_trans *trans,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
@@ -1796,14 +2071,17 @@ static int invalidate_one_bucket(struct btree_trans *trans,
return 1;
if (!bch2_dev_bucket_exists(c, bucket)) {
- prt_str(&buf, "lru entry points to invalid bucket");
- goto err;
+ if (fsck_err(trans, lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ bucket.inode, bucket.offset))
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ goto out;
}
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1813,6 +2091,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
@@ -1823,49 +2102,44 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.gen++;
a->v.data_type = 0;
a->v.dirty_sectors = 0;
+ a->v.stripe_sectors = 0;
a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
-
- ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ a->v.io_time[READ] = bch2_current_io_time(c, READ);
+ a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
- bch2_trans_iter_exit(trans, &alloc_iter);
+fsck_err:
printbuf_exit(&buf);
return ret;
-err:
- prt_str(&buf, "\n lru key: ");
- bch2_bkey_val_to_text(&buf, c, lru_k);
-
- prt_str(&buf, "\n lru entry: ");
- bch2_lru_pos_to_text(&buf, lru_iter->pos);
-
- prt_str(&buf, "\n alloc key: ");
- if (!a)
- bch2_bpos_to_text(&buf, bucket);
- else
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
-
- bch_err(c, "%s", buf.buf);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
- bch2_inconsistent_error(c);
- ret = -EINVAL;
+}
+
+static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_dev *ca, bool *wrapped)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ if (!k.k && !*wrapped) {
+ bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ *wrapped = true;
+ goto again;
}
- goto out;
+ return k;
}
static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
@@ -1873,31 +2147,64 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (ret)
goto err;
- for_each_member_device(c, ca) {
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0, 0),
- lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
- BTREE_ITER_INTENT, k,
- invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
- if (ret < 0) {
- percpu_ref_put(&ca->ref);
+ while (true) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (ret)
+ goto restart_err;
+ if (!k.k)
break;
- }
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+restart_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
- !queue_work(c->write_ref_wq, &c->invalidate_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
}
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -1917,13 +2224,13 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
/*
* Scan the alloc btree for every bucket on @ca, and add buckets to the
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
- if (last_updated + HZ * 10 < jiffies) {
+ if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;
@@ -1949,7 +2256,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (ret)
@@ -2021,7 +2328,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
@@ -2037,23 +2344,51 @@ int bch2_fs_freespace_init(struct bch_fs *c)
return 0;
}
+/* device removal */
+
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
+ bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
+ bch_err_msg(ca, ret, "removing dev alloc info");
+ return ret;
+}
+
/* Bucket IO clocks: */
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- u64 now;
- int ret = 0;
- a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
- ret = PTR_ERR_OR_ZERO(a);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
+ int ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
- now = atomic64_read(&c->io_clock[rw].now);
+ u64 now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2066,6 +2401,15 @@ out:
return ret;
}
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
+}
+
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -2130,6 +2474,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity);
+ c->reserved = reserved_sectors;
c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max;
@@ -2168,13 +2513,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ c->rw_devs_change_count++;
+
/*
* Capacity is calculated based off of devices in allocation groups:
*/
@@ -2203,16 +2550,29 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
+}
+
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
+{
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- INIT_WORK(&c->discard_work, bch2_do_discards_work);
- INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index e7f7e842ee1b..de25ba4ee94b 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,21 +8,16 @@
#include "debug.h"
#include "super.h"
-enum bkey_invalid_flags;
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- struct bch_dev *ca;
-
- if (!bch2_dev_exists2(c, pos.inode))
- return false;
-
- ca = bch_dev_bkey_exists(c, pos.inode);
- return pos.offset >= ca->mi.first_bucket &&
- pos.offset < ca->mi.nbuckets;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
+ bool ret = ca && bucket_valid(ca, pos.offset);
+ rcu_read_unlock();
+ return ret;
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -40,58 +35,122 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
return a.gen - a.oldest_gen;
}
-static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
- u32 cached_sectors,
- u32 stripe,
- struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
{
- if (stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (dirty_sectors)
- return data_type;
- if (cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
}
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
+{
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
+}
+
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
- return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
- a.stripe, a, data_type);
+ struct bch_alloc_v4 ret = {};
+ __bucket_m_to_alloc(&ret, b);
+ return ret;
}
static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
{
- return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+ switch (data_type) {
+ case BCH_DATA_cached:
+ case BCH_DATA_stripe:
+ return BCH_DATA_user;
+ default:
+ return data_type;
+ }
+}
+
+static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
+ enum bch_data_type ptr)
+{
+ return !data_type_is_empty(bucket) &&
+ bucket_data_type(bucket) != bucket_data_type(ptr);
}
-static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+/*
+ * It is my general preference to use unsigned types for unsigned quantities -
+ * however, these helpers are used in disk accounting calculations run by
+ * triggers where the output will be negated and added to an s64. unsigned is
+ * right out even though all these quantities will fit in 32 bits, since it
+ * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
+ * simpler option here.
+ */
+static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
- return a.dirty_sectors + a.cached_sectors;
+ return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
}
-static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
{
- return a.dirty_sectors;
+ return a.stripe_sectors + a.dirty_sectors;
}
-static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached
+ ? a.cached_sectors
+ : bch2_bucket_sectors_dirty(a);
+}
+
+static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
struct bch_alloc_v4 a)
{
- int d = bch2_bucket_sectors_dirty(a);
+ int d = bch2_bucket_sectors(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
+{
+ int d = a.stripe_sectors + a.dirty_sectors;
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
+static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (a.stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (bch2_bucket_sectors_dirty(a))
+ return data_type;
+ if (a.cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
+{
+ a->data_type = alloc_data_type(*a, data_type);
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
- return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+ return a.data_type == BCH_DATA_cached
+ ? a.io_time[READ] & LRU_TIME_MAX
+ : 0;
}
#define DATA_TYPES_MOVABLE \
@@ -107,11 +166,20 @@ static inline bool data_type_movable(enum bch_data_type type)
static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
+ if (a.data_type >= BCH_DATA_NR)
+ return 0;
+
if (!data_type_movable(a.data_type) ||
!bch2_bucket_sectors_fragmented(ca, a))
return 0;
- u64 d = bch2_bucket_sectors_dirty(a);
+ /*
+ * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
+ * bucket_sectors_dirty is (much) bigger than bucket_size
+ */
+ u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
+
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
@@ -126,13 +194,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
return pos;
}
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
{
- unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
BCH_ALLOC_V4_U64s_V0) +
BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
(sizeof(struct bch_backpointer) / sizeof(u64));
+}
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = alloc_v4_u64s_noerror(a);
BUG_ON(ret > U8_MAX - BKEY_U64s);
return ret;
}
@@ -143,7 +215,10 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
+ enum btree_iter_update_trigger_flags);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -168,52 +243,52 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v1_invalid, \
+ .key_validate = bch2_alloc_v1_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v2_invalid, \
+ .key_validate = bch2_alloc_v2_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v3_invalid, \
+ .key_validate = bch2_alloc_v3_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v4_invalid, \
+ .key_validate = bch2_alloc_v4_validate, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
})
-int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
- .key_invalid = bch2_bucket_gens_invalid, \
+ .key_validate = bch2_bucket_gens_validate, \
.val_to_text = bch2_bucket_gens_to_text, \
})
@@ -228,10 +303,17 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
+int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, unsigned);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -246,6 +328,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
+void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);
static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -262,6 +345,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
+int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
@@ -269,6 +353,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index b4ec20be93b8..740238369a5a 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
struct bch_alloc_v4 {
struct bch_val v;
- __u64 journal_seq;
+ __u64 journal_seq_nonempty;
__u32 flags;
__u8 gen;
__u8 oldest_gen;
@@ -69,7 +69,10 @@ struct bch_alloc_v4 {
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
- __u64 fragmentation_lru;
+ /* end of fields in original version of alloc_v4 */
+ __u64 journal_seq_empty;
+ __u32 stripe_sectors;
+ __u32 pad;
} __packed __aligned(8);
#define BCH_ALLOC_V4_U64s_V0 6
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b353..5a781fb4c794 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c)
{
rcu_read_lock();
for_each_member_device_rcu(c, ca, NULL)
- ca->alloc_cursor = 0;
+ memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
rcu_read_unlock();
}
@@ -100,21 +100,17 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (ob->ec) {
ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
return;
}
- percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
-
ob->valid = false;
ob->data_type = 0;
-
spin_unlock(&ob->lock);
- percpu_up_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_open_bucket_hash_remove(c, ob);
@@ -156,12 +152,24 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
+static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+{
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
+ return false;
+
+ return bch2_is_superblock_bucket(ca, b);
+}
+
static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
BUG_ON(c->open_buckets_partial_nr >=
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
+ rcu_read_unlock();
+
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
ob - c->open_buckets;
@@ -171,25 +179,13 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->freelist_wait);
}
-/* _only_ for allocating the journal on a new device: */
-long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
-{
- while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
- u64 b = ca->new_fs_bucket_idx++;
-
- if (!is_superblock_bucket(ca, b) &&
- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
- return b;
- }
-
- return -1;
-}
-
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
@@ -200,33 +196,44 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
}
}
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket,
- enum bch_watermark watermark,
- const struct bch_alloc_v4 *a,
- struct bucket_alloc_state *s,
- struct closure *cl)
+static inline bool may_alloc_bucket(struct bch_fs *c,
+ struct bpos bucket,
+ struct bucket_alloc_state *s)
{
- struct open_bucket *ob;
-
- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- s->skipped_nouse++;
- return NULL;
- }
-
- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
s->skipped_open++;
- return NULL;
+ return false;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ u64 journal_seq_ready =
+ bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ bucket.inode, bucket.offset);
+ if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
+ if (journal_seq_ready > c->journal.flushing_seq)
+ s->need_journal_commit++;
s->skipped_need_journal_commit++;
- return NULL;
+ return false;
}
- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
s->skipped_nocow++;
+ return false;
+ }
+
+ return true;
+}
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket, u8 gen,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ if (unlikely(is_superblock_bucket(c, ca, bucket)))
+ return NULL;
+
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
return NULL;
}
@@ -236,8 +243,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -249,136 +255,50 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
return NULL;
}
- ob = bch2_open_bucket_alloc(c);
+ struct open_bucket *ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
-
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->dev = ca->dev_idx;
- ob->gen = a->gen;
+ ob->gen = gen;
ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, false);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
}
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_watermark watermark, u64 free_entry,
+ enum bch_watermark watermark,
struct bucket_alloc_state *s,
- struct bkey_s_c freespace_k,
+ struct btree_iter *freespace_iter,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
- struct open_bucket *ob;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- u64 b = free_entry & ~(~0ULL << 56);
- unsigned genbits = free_entry >> 56;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
- prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
- " freespace key ",
- ca->mi.first_bucket, ca->mi.nbuckets);
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
-
- k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_alloc, POS(ca->dev_idx, b),
- BTREE_ITER_CACHED);
- ret = bkey_err(k);
- if (ret) {
- ob = ERR_PTR(ret);
- goto err;
- }
-
- a = bch2_alloc_to_v4(k, &a_convert);
-
- if (a->data_type != BCH_DATA_free) {
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
- ob = NULL;
- goto err;
- }
-
- prt_printf(&buf, "non free bucket in freespace btree\n"
- " freespace key ");
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
-
- if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
- " freespace key ",
- genbits, alloc_freespace_genbits(*a) >> 56);
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
-
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- struct bch_backpointer bp;
- struct bpos bp_pos = POS_MIN;
+ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
- ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
- &bp_pos, &bp,
- BTREE_ITER_NOPRESERVE);
- if (ret) {
- ob = ERR_PTR(ret);
- goto err;
- }
+ if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
+ return NULL;
- if (!bkey_eq(bp_pos, POS_MAX)) {
- /*
- * Bucket may have data in it - we don't call
- * bc2h_trans_inconnsistent() because fsck hasn't
- * finished yet
- */
- ob = NULL;
- goto err;
- }
- }
+ u8 gen;
+ int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret)
+ return NULL;
- ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
- if (!ob)
- set_btree_iter_dontneed(&iter);
-err:
- if (iter.path)
- set_btree_iter_dontneed(&iter);
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ob;
+ return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
}
/*
* This path is for before the freespace btree is initialized:
- *
- * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
- * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
@@ -387,11 +307,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bucket_alloc_state *s,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter, citer;
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
- u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 first_bucket = ca->mi.first_bucket;
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
int ret;
@@ -406,23 +328,35 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
*/
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_SLOTS, k, ret) {
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
+ BTREE_ITER_slots, k, ret) {
+ u64 bucket = k.k->p.offset;
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
- if (ca->new_fs_bucket_idx &&
- is_superblock_bucket(ca, k.k->p.offset))
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ break;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ s->buckets_seen++;
+ s->skipped_mi_btree_bitmap++;
continue;
+ }
- a = bch2_alloc_to_v4(k, &a_convert);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
if (a->data_type != BCH_DATA_free)
continue;
/* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
ret = bkey_err(ck);
if (ret)
break;
@@ -433,9 +367,12 @@ again:
s->buckets_seen++;
- ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+ ob = may_alloc_bucket(c, k.k->p, s)
+ ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
+ watermark, s, cl)
+ : NULL;
next:
- set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -443,7 +380,6 @@ next:
bch2_trans_iter_exit(trans, &iter);
alloc_cursor = iter.pos.offset;
- ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
@@ -453,6 +389,8 @@ next:
goto again;
}
+ *dev_alloc_cursor = alloc_cursor;
+
return ob;
}
@@ -465,44 +403,63 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
-
- BUG_ON(ca->new_fs_bucket_idx);
again:
- for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
- if (k.k->p.inode != ca->dev_idx)
- break;
-
- for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
- alloc_cursor < k.k->p.offset;
- alloc_cursor++) {
- ret = btree_trans_too_many_iters(trans);
- if (ret) {
- ob = ERR_PTR(ret);
- break;
- }
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor),
+ POS(ca->dev_idx, U64_MAX),
+ 0, k, ret) {
+ /*
+ * peek normally dosen't trim extents - they can span iter.pos,
+ * which is not what we want here:
+ */
+ iter.k.size = iter.k.p.offset - iter.pos.offset;
+ while (iter.k.size) {
s->buckets_seen++;
- ob = try_alloc_bucket(trans, ca, watermark,
- alloc_cursor, s, k, cl);
+ u64 bucket = iter.pos.offset & ~(~0ULL << 56);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ goto fail;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket + 1),
+ 1ULL << ca->mi.btree_bitmap_shift));
+ alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
+
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ s->skipped_mi_btree_bitmap++;
+ goto next;
+ }
+
+ ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
if (ob) {
- set_btree_iter_dontneed(&iter);
+ if (!IS_ERR(ob))
+ *dev_alloc_cursor = iter.pos.offset;
+ bch2_set_btree_iter_dontneed(&iter);
break;
}
- }
+ iter.k.size--;
+ iter.pos.offset++;
+ }
+next:
if (ob || ret)
break;
}
+fail:
bch2_trans_iter_exit(trans, &iter);
- ca->alloc_cursor = alloc_cursor;
+ BUG_ON(ob && ret);
- if (!ob && ret)
+ if (ret)
ob = ERR_PTR(ret);
if (!ob && alloc_start > ca->mi.first_bucket) {
@@ -513,12 +470,53 @@ again:
return ob;
}
+static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl,
+ struct bch_dev_usage *usage,
+ struct bucket_alloc_state *s,
+ struct open_bucket *ob)
+{
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_tabstop_push(&buf, 24);
+
+ prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+ prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
+ prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
+ prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", s->skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+
+ if (!IS_ERR(ob)) {
+ prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
+ trace_bucket_alloc(c, buf.buf);
+ } else {
+ prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
+ trace_bucket_alloc_fail(c, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
/**
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
* @ca: device to allocate from
* @watermark: how important is this allocation?
+ * @data_type: BCH_DATA_journal, btree, user...
* @cl: if not NULL, closure to be used to wait if buckets not available
+ * @nowait: if true, do not wait for buckets to become available
* @usage: for secondarily also returning the current device usage
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
@@ -526,37 +524,44 @@ again:
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl,
+ bool nowait,
struct bch_dev_usage *usage)
{
struct bch_fs *c = trans->c;
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
u64 avail;
- struct bucket_alloc_state s = { 0 };
- bool waiting = false;
+ struct bucket_alloc_state s = {
+ .btree_bitmap = data_type == BCH_DATA_btree,
+ };
+ bool waiting = nowait;
again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_do_discards(c);
+ bch2_dev_do_discards(ca);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
- bch2_do_gc_gens(c);
+ bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (!avail) {
+ if (watermark > BCH_WATERMARK_normal &&
+ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ goto alloc;
+
if (cl && !waiting) {
closure_wait(&c->freelist_wait, cl);
waiting = true;
goto again;
}
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -569,9 +574,14 @@ alloc:
? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
- if (s.skipped_need_journal_commit * 2 > avail)
+ if (s.need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
+ if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
+ s.btree_bitmap = BTREE_BITMAP_ANY;
+ goto alloc;
+ }
+
if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
freespace = false;
goto alloc;
@@ -581,41 +591,32 @@ err:
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
if (!IS_ERR(ob))
- trace_and_count(c, bucket_alloc, ca,
- bch2_watermarks[watermark],
- ob->bucket,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- "");
+ ob->data_type = data_type;
+
+ if (!IS_ERR(ob))
+ count_event(c, bucket_alloc);
else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- trace_and_count(c, bucket_alloc_fail, ca,
- bch2_watermarks[watermark],
- 0,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- bch2_err_str(PTR_ERR(ob)));
+ count_event(c, bucket_alloc_fail);
+
+ if (!IS_ERR(ob)
+ ? trace_bucket_alloc_enabled()
+ : trace_bucket_alloc_fail_enabled())
+ trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
return ob;
}
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl)
{
struct bch_dev_usage usage;
struct open_bucket *ob;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- cl, &usage)));
+ data_type, cl, false, &usage)));
return ob;
}
@@ -636,9 +637,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
unsigned i;
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.devs[ret.nr++] = i;
+ ret.data[ret.nr++] = i;
- bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+ bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
return ret;
}
@@ -678,11 +679,9 @@ static int add_new_bucket(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
struct open_bucket *ob)
{
- unsigned durability =
- bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned durability = ob_dev(c, ob)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
@@ -706,45 +705,33 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
+ enum bch_write_flags flags,
enum bch_data_type data_type,
enum bch_watermark watermark,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct dev_alloc_list devs_sorted =
- bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- unsigned dev;
- struct bch_dev *ca;
int ret = -BCH_ERR_insufficient_devices;
- unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
- for (i = 0; i < devs_sorted.nr; i++) {
- struct bch_dev_usage usage;
- struct open_bucket *ob;
-
- dev = devs_sorted.devs[i];
-
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ darray_for_each(devs_sorted, i) {
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
if (!ca)
continue;
if (!ca->mi.durability && *have_cache) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+ struct bch_dev_usage usage;
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
@@ -753,11 +740,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob->data_type = data_type;
-
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob)) {
+ have_cache, ob)) {
ret = 0;
break;
}
@@ -783,14 +768,10 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct dev_alloc_list devs_sorted;
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i, ec_idx;
int ret = 0;
if (nr_replicas < 2)
@@ -799,34 +780,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
if (ec_open_bucket(c, ptrs))
return 0;
- h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ struct ec_stripe_head *h =
+ bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
return 0;
- devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
-
- for (i = 0; i < devs_sorted.nr; i++)
- for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+ darray_for_each(devs_sorted, i)
+ for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
- ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->dev == devs_sorted.devs[i] &&
- !test_and_set_bit(ec_idx, h->s->blocks_allocated))
- goto got_bucket;
+ struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
+ if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
+ ob->ec_idx = ec_idx;
+ ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob);
+ goto out;
+ }
}
- goto out_put_head;
-got_bucket:
- ob->ec_idx = ec_idx;
- ob->ec = h->s;
- ec_stripe_new_get(h->s, STRIPE_REF_io);
-
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, flags, ob);
-out_put_head:
+out:
bch2_ec_stripe_head_put(c, h);
return ret;
}
@@ -839,7 +818,7 @@ static bool want_bucket(struct bch_fs *c,
bool *have_cache, bool ec,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (!test_bit(ob->dev, devs_may_alloc->d))
return false;
@@ -864,7 +843,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- bool ec, unsigned flags)
+ bool ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@@ -876,7 +855,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
have_cache, ec, ob))
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
else
ob_push(c, &ptrs_skip, ob);
}
@@ -892,8 +871,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache, bool ec,
- enum bch_watermark watermark,
- unsigned flags)
+ enum bch_watermark watermark)
{
int i, ret = 0;
@@ -909,12 +887,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_dev_usage usage;
u64 avail;
bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark);
+ avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
if (!avail)
continue;
@@ -923,9 +901,13 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
if (ret)
break;
}
@@ -945,7 +927,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
@@ -964,18 +946,15 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
- if (erasure_code && ec_open_bucket(c, ptrs))
- return 0;
-
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, flags);
+ have_cache, erasure_code);
if (ret)
return ret;
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, watermark, flags);
+ have_cache, erasure_code, watermark);
if (ret)
return ret;
@@ -1016,12 +995,12 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
int ret;
- if (erasure_code) {
+ if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
ret = __open_bucket_add_buckets(trans, ptrs, wp,
devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,
@@ -1136,7 +1115,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
--c->open_buckets_partial_nr;
swap(c->open_buckets_partial[i],
c->open_buckets_partial[c->open_buckets_partial_nr]);
+
ob->on_partial_list = false;
+
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
spin_lock(&c->freelist_lock);
@@ -1294,7 +1279,7 @@ deallocate_extra_replicas(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
- unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned d = ob_dev(c, ob)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
@@ -1318,7 +1303,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl,
struct write_point **wp_ret)
{
@@ -1334,8 +1319,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
-
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@@ -1345,6 +1328,10 @@ retry:
*wp_ret = wp = writepoint_find(trans, write_point.v);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
/* metadata may not allocate on cache devices: */
if (wp->data_type != BCH_DATA_user)
have_cache = true;
@@ -1361,15 +1348,17 @@ retry:
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, watermark,
flags, cl);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ if (!ret2 ||
+ bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+ ret = ret2;
goto alloc_done;
+ }
}
/*
@@ -1434,18 +1423,19 @@ err:
try_decrease_writepoints(trans, write_points_nr))
goto retry;
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- return cl
- ? -BCH_ERR_bucket_alloc_blocked
- : -BCH_ERR_ENOSPC_bucket_alloc;
+ ret = -BCH_ERR_bucket_alloc_blocked;
return ret;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
return (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
@@ -1519,9 +1509,9 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
}
}
-static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
@@ -1539,7 +1529,8 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
prt_newline(out);
}
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca)
{
struct open_bucket *ob;
@@ -1549,7 +1540,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list)
+ if (ob->valid && (!ca || ob->dev == ca->dev_idx))
bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
@@ -1623,3 +1614,121 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
prt_str(out, "Btree write point\n");
bch2_write_point_to_text(out, c, &c->btree_write_point);
}
+
+void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 24);
+
+ prt_printf(out, "capacity\t%llu\n", c->capacity);
+ prt_printf(out, "reserved\t%llu\n", c->reserved);
+ prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
+ prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
+ prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
+ prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
+ prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
+ prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
+ prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
+
+ prt_newline(out);
+ prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
+ prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
+ prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
+ prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
+}
+
+void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ bch2_dev_usage_to_text(out, ca, &stats);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:\n");
+ for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
+ prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
+ prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+}
+
+static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
+ c->opts.allocator_stuck_timeout);
+
+ prt_printf(&buf, "Allocator debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_fs_alloc_debug_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ for_each_online_member(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ prt_printf(&buf, "Copygc debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_copygc_wait_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Journal debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_journal_debug_to_text(&buf, &c->journal);
+ printbuf_indent_sub(&buf, 2);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static inline unsigned allocator_wait_timeout(struct bch_fs *c)
+{
+ if (c->allocator_last_stuck &&
+ time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
+ return 0;
+
+ return c->opts.allocator_stuck_timeout * HZ;
+}
+
+void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ unsigned t = allocator_wait_timeout(c);
+
+ if (t && closure_sync_timeout(cl, t)) {
+ c->allocator_last_stuck = jiffies;
+ bch2_print_allocator_stuck(c);
+ }
+
+ closure_sync(cl);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 7aaeec44c746..f25481a0d1a0 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *);
struct dev_alloc_list {
unsigned nr;
- u8 devs[BCH_SB_MEMBERS_MAX];
+ u8 data[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
@@ -28,10 +28,14 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct bch_devs_mask *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-long bch2_bucket_alloc_new_fs(struct bch_dev *);
+static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
+{
+ return bch2_dev_have_ref(c, ob->dev);
+}
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, struct closure *);
+ enum bch_watermark, enum bch_data_type,
+ struct closure *);
static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
struct open_bucket *ob)
@@ -149,9 +153,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
return ret;
}
+enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, unsigned,
+ unsigned, unsigned *, bool *, enum bch_write_flags,
enum bch_data_type, enum bch_watermark,
struct closure *);
@@ -161,7 +166,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct bch_devs_list *,
unsigned, unsigned,
enum bch_watermark,
- unsigned,
+ enum bch_write_flags,
struct closure *,
struct write_point **);
@@ -184,7 +189,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
wp->sectors_allocated += sectors;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
ptr.cached = cached ||
@@ -216,9 +221,20 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
+void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
+
+void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
+static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ if (cl->closure_get_happened)
+ __bch2_wait_on_allocator(c, cl);
+}
+
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..4aa8ee026cb8 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -9,11 +9,19 @@
#include "fifo.h"
struct bucket_alloc_state {
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
u64 buckets_seen;
u64 skipped_open;
u64 skipped_need_journal_commit;
+ u64 need_journal_commit;
u64 skipped_nocow;
u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
};
#define BCH_WATERMARKS() \
@@ -22,7 +30,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 569b97904da4..ebeb6a5ff9d2 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,88 +8,87 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
+#include "disk_accounting.h"
#include "error.h"
#include <linux/mm.h>
-static bool extent_matches_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct bpos bucket,
- struct bch_backpointer bp)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket2;
- struct bch_backpointer bp2;
-
- if (p.ptr.cached)
- continue;
-
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
- &bucket2, &bp2);
- if (bpos_eq(bucket, bucket2) &&
- !memcmp(&bp, &bp2, sizeof(bp)))
- return true;
- }
-
- return false;
-}
-
-int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
- bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
- c, err,
- backpointer_pos_wrong,
- "backpointer at wrong pos");
+ bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
+ c, backpointer_level_bad,
+ "backpointer level bad: %u >= %u",
+ bp.v->level, BTREE_MAX_DEPTH);
+
+ bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
+ c, backpointer_dev_bad,
+ "backpointer for BCH_SB_MEMBER_INVALID");
fsck_err:
return ret;
}
-void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
- bch2_btree_id_str(bp->btree_id),
- bp->level,
- (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- bp->bucket_len);
- bch2_bpos_to_text(out, bp->pos);
-}
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- if (bch2_dev_exists2(c, k.k->p.inode)) {
- prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
- prt_str(out, " ");
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+ if (ca) {
+ u32 bucket_offset;
+ struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
+ rcu_read_unlock();
+ prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
+ } else {
+ rcu_read_unlock();
+ prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
}
- bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+ bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
+ (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp.v->bucket_len,
+ bp.v->bucket_gen);
+ bch2_bpos_to_text(out, bp.v->pos);
}
void bch2_backpointer_swab(struct bkey_s k)
{
struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
- bp.v->bucket_offset = swab40(bp.v->bucket_offset);
bp.v->bucket_len = swab32(bp.v->bucket_len);
bch2_bpos_swab(&bp.v->pos);
}
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bkey_s_c_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bkey_i_backpointer bp2;
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
+
+ if (bpos_eq(bp.k->p, bp2.k.p) &&
+ !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
+ return true;
+ }
+
+ return false;
+}
+
static noinline int backpointer_mod_err(struct btree_trans *trans,
- struct bch_backpointer bp,
- struct bkey_s_c bp_k,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *new_bp,
+ struct bkey_s_c found_bp,
bool insert)
{
struct bch_fs *c = trans->c;
@@ -97,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
if (insert) {
prt_printf(&buf, "existing backpointer found when inserting ");
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "found ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
+ bch2_bkey_val_to_text(&buf, c, found_bp);
prt_newline(&buf);
prt_printf(&buf, "for ");
@@ -110,16 +109,15 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
bch_err(c, "%s", buf.buf);
} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- prt_printf(&buf, "backpointer not found when deleting");
- prt_newline(&buf);
+ prt_printf(&buf, "backpointer not found when deleting\n");
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "searching for ");
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
prt_newline(&buf);
prt_printf(&buf, "got ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
+ bch2_bkey_val_to_text(&buf, c, found_bp);
prt_newline(&buf);
prt_printf(&buf, "for ");
@@ -131,257 +129,225 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_exit(&buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- bch2_inconsistent_error(c);
- return -EIO;
+ return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
} else {
return 0;
}
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bpos bucket,
- struct bch_backpointer bp,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
bool insert)
{
struct btree_iter bp_iter;
- struct bkey_s_c k;
- struct bkey_i_backpointer *bp_k;
- int ret;
-
- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
- ret = PTR_ERR_OR_ZERO(bp_k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bp->k.p,
+ BTREE_ITER_intent|
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
if (ret)
return ret;
- bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
- bp_k->v = bp;
-
- if (!insert) {
- bp_k->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k->k, 0);
- }
-
- k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bp_k->k.p,
- BTREE_ITER_INTENT|
- BTREE_ITER_SLOTS|
- BTREE_ITER_WITH_UPDATES);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
if (insert
? k.k->type
: (k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
- ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
+ ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
if (ret)
goto err;
}
- ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+ if (!insert) {
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
err:
bch2_trans_iter_exit(trans, &bp_iter);
return ret;
}
-/*
- * Find the next backpointer >= *bp_offset:
- */
-int bch2_get_next_backpointer(struct btree_trans *trans,
- struct bpos bucket, int gen,
- struct bpos *bp_pos,
- struct bch_backpointer *bp,
- unsigned iter_flags)
+static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
{
- struct bch_fs *c = trans->c;
- struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
- struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
- struct bkey_s_c k;
- int ret = 0;
-
- if (bpos_ge(*bp_pos, bp_end_pos))
- goto done;
-
- if (gen >= 0) {
- k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED|iter_flags);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- if (k.k->type != KEY_TYPE_alloc_v4 ||
- bkey_s_c_to_alloc_v4(k).v->gen != gen)
- goto done;
- }
-
- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
-
- for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
- *bp_pos, iter_flags, k, ret) {
- if (bpos_ge(k.k->p, bp_end_pos))
- break;
+ return (likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
+ : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
- *bp_pos = k.k->p;
- *bp = *bkey_s_c_to_backpointer(k).v;
- goto out;
- }
-done:
- *bp_pos = SPOS_MAX;
-out:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
+static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c visiting_k,
+ struct bkey_buf *last_flushed)
+{
+ return likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
+ : 0;
}
-static void backpointer_not_found(struct btree_trans *trans,
- struct bpos bp_pos,
- struct bch_backpointer bp,
- struct bkey_s_c k)
+static int backpointer_target_not_found(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_s_c target_k,
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ int ret = 0;
/*
* If we're using the btree write buffer, the backpointer we were
* looking at may have already been deleted - failure to find what it
* pointed to is not an error:
*/
- if (likely(!bch2_backpointers_no_use_write_buffer))
- return;
+ ret = last_flushed
+ ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
+ : 0;
+ if (ret)
+ return ret;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- bp.level ? "btree node" : "extent");
- prt_printf(&buf, "bucket: ");
- bch2_bpos_to_text(&buf, bucket);
- prt_printf(&buf, "\n ");
+ bp.v->level ? "btree node" : "extent");
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
- prt_printf(&buf, "backpointer pos: ");
- bch2_bpos_to_text(&buf, bp_pos);
prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, target_k);
- bch2_backpointer_to_text(&buf, &bp);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
- bch_err_ratelimited(c, "%s", buf.buf);
- else
- bch2_trans_inconsistent(trans, "%s", buf.buf);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
+ if (p.ptr.dev == bp.k->p.inode) {
+ prt_printf(&buf, "\n ");
+ struct bkey_i_backpointer bp2;
+ bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
+ }
+ if (fsck_err(trans, backpointer_to_missing_ptr,
+ "%s", buf.buf))
+ ret = bch2_backpointer_del(trans, bp.k->p);
+fsck_err:
printbuf_exit(&buf);
+ return ret;
}
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
- struct bpos bp_pos,
- struct bch_backpointer bp,
- unsigned iter_flags)
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed)
{
- if (likely(!bp.level)) {
- struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
+ struct bch_fs *c = trans->c;
+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
+ return bkey_s_c_null;
+
+ if (likely(!bp.v->level)) {
bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
+ bp.v->btree_id,
+ bp.v->pos,
0, 0,
iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
}
- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ if (k.k &&
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
return k;
bch2_trans_iter_exit(trans, iter);
- backpointer_not_found(trans, bp_pos, bp, k);
- return bkey_s_c_null;
+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
- struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+ if (IS_ERR_OR_NULL(b))
+ return ((struct bkey_s_c) { .k = ERR_CAST(b) });
- if (IS_ERR_OR_NULL(b)) {
- bch2_trans_iter_exit(trans, iter);
- return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
- }
return bkey_i_to_s_c(&b->key);
}
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
- struct bpos bp_pos,
- struct bch_backpointer bp)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct btree *b;
- BUG_ON(!bp.level);
+ BUG_ON(!bp.v->level);
bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
+ bp.v->btree_id,
+ bp.v->pos,
0,
- bp.level - 1,
+ bp.v->level - 1,
0);
- b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(iter);
if (IS_ERR_OR_NULL(b))
goto err;
- BUG_ON(b->c.level != bp.level - 1);
+ BUG_ON(b->c.level != bp.v->level - 1);
- if (extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
+ bkey_i_to_s_c(&b->key), bp))
return b;
if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
- backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
- b = NULL;
+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
+ b = ret ? ERR_PTR(ret) : NULL;
}
err:
bch2_trans_iter_exit(trans, iter);
return b;
}
-static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
- struct bkey_s_c k)
+static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
{
+ if (k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
struct bch_fs *c = trans->c;
struct btree_iter alloc_iter = { NULL };
struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
- backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_backpointer_del(trans, k.k->p);
goto out;
}
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bp_pos_to_bucket(c, k.k->p), 0);
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
ret = bkey_err(alloc_k);
if (ret)
goto out;
- if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
- backpointer_to_missing_alloc,
- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
- alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
- goto out;
+ if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_alloc,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_backpointer_del(trans, k.k->p);
}
out:
fsck_err:
@@ -393,94 +359,222 @@ fsck_err:
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_btree_backpointer(trans, &iter, k)));
+ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
struct extents_to_bp_state {
- struct bpos bucket_start;
- struct bpos bucket_end;
+ struct bpos bp_start;
+ struct bpos bp_end;
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ if (!ca)
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, o_btree);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ trans, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
- struct bpos bucket,
- struct bch_backpointer bp,
+ struct bkey_i_backpointer *bp,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
- struct bkey_s_c bp_k;
- struct bkey_buf tmp;
- int ret;
-
- bch2_bkey_buf_init(&tmp);
- if (bpos_lt(bucket, s->bucket_start) ||
- bpos_gt(bucket, s->bucket_end))
+ if (bpos_lt(bp->k.p, s->bp_start) ||
+ bpos_gt(bp->k.p, s->bp_end))
return 0;
- if (!bch2_dev_bucket_exists(c, bucket))
+ struct btree_iter bp_iter;
+ struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
+ int ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
+ if (ret)
+ goto err;
+
+ goto check_existing_bp;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
goto missing;
- bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp(c, bucket, bp.bucket_offset),
- 0);
- ret = bkey_err(bp_k);
+ struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
if (ret)
goto err;
- if (bp_k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+ if (!other_extent.k)
+ goto missing;
- if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
- if (bp.level) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
- ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+ other_extent, bp->k.p.inode);
if (ret)
goto err;
-
- bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
+ if (ret)
+ goto err;
+ goto missing;
}
+ }
+
+ ret = check_extent_checksum(trans,
+ other_bp.v->btree_id, other_extent,
+ bp->v.btree_id, orig_k,
+ bp->k.p.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
goto missing;
}
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&tmp, c);
- printbuf_exit(&buf);
- return ret;
+
+ ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
+ other_bp.v->btree_id, other_extent, bp->k.p.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
- prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
- bch2_btree_id_str(bp.btree_id), bp.level);
+ printbuf_reset(&buf);
+ prt_str(&buf, "missing backpointer\n for: ");
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+ if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
+ ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
goto out;
}
@@ -491,25 +585,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int ret;
- ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
- struct bch_backpointer bp;
-
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level,
- k, p, &bucket_pos, &bp);
+ if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
+ continue;
- ret = check_bp_exists(trans, s, bucket_pos, bp, k);
- if (ret)
- return ret;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+ bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
+ bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
+ rcu_read_unlock();
+
+ if (check || empty) {
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+
+ int ret = check
+ ? check_bp_exists(trans, s, &bp, k)
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
+ if (ret)
+ return ret;
+ }
}
return 0;
@@ -555,69 +657,138 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
};
}
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
- u64 mem_bytes;
-
si_meminfo(&i);
- mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+ u64 mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- unsigned btree_leaf_mask,
- unsigned btree_interior_mask,
+ u64 btree_leaf_mask,
+ u64 btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- enum btree_id btree;
+ struct bch_fs *c = trans->c;
+ s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
- for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
- unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+ bch2_btree_cache_unpin(c);
+
+ btree_interior_mask |= btree_leaf_mask;
+
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
+ c->btree_cache.pinned_nodes_start = start;
+ c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
- if (!((1U << btree) & btree_leaf_mask) &&
- !((1U << btree) & btree_interior_mask))
+ for (enum btree_id btree = start.btree;
+ btree < BTREE_ID_NR && !ret;
+ btree++) {
+ unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
+
+ if (!(BIT_ULL(btree) & btree_leaf_mask) &&
+ !(BIT_ULL(btree) & btree_interior_mask))
continue;
- bch2_trans_node_iter_init(trans, &iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, 0);
- /*
- * for_each_btree_key_contineu() doesn't check the return value
- * from bch2_btree_iter_advance(), which is needed when
- * iterating over interior nodes where we'll see keys at
- * SPOS_MAX:
- */
- do {
- k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
- ret = bkey_err(k);
- if (!k.k || ret)
+ ret = __for_each_btree_node(trans, iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, BTREE_ITER_prefetch, b, ({
+ mem_may_pin -= btree_buf_bytes(b);
+ if (mem_may_pin <= 0) {
+ c->btree_cache.pinned_nodes_end = *end =
+ BBPOS(btree, b->key.k.p);
break;
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = BBPOS(btree, k.k->p);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
}
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(trans, &iter);
+ bch2_node_pin(c, b);
+ 0;
+ }));
+ }
+
+ return ret;
+}
+
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
}
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
- *end = BBPOS_MAX;
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
return ret;
}
+static void progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
+ struct progress_indicator_state progress;
int ret = 0;
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
@@ -631,31 +802,14 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
while (level >= depth) {
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- level,
- BTREE_ITER_PREFETCH);
- while (1) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
- ret = bkey_err(k) ?:
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret)
- break;
- if (bpos_eq(iter.pos, SPOS_MAX))
- break;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
-
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
+
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }));
if (ret)
return ret;
@@ -666,102 +820,330 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
- struct bpos bucket)
+enum alloc_sector_counter {
+ ALLOC_dirty,
+ ALLOC_cached,
+ ALLOC_stripe,
+ ALLOC_SECTORS_NR
+};
+
+static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
{
- return bch2_dev_exists2(c, bucket.inode)
- ? bucket_pos_to_bp(c, bucket, 0)
- : bucket;
+ switch (t) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ return ALLOC_dirty;
+ case BCH_DATA_cached:
+ return ALLOC_cached;
+ case BCH_DATA_stripe:
+ return ALLOC_stripe;
+ default:
+ BUG();
+ }
}
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
+static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
+
+static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
+ struct bkey_buf *last_flushed)
{
- struct btree_iter alloc_iter;
- struct btree_iter bp_iter;
- struct bkey_s_c alloc_k, bp_k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- bool alloc_end = false, bp_end = false;
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ bool need_commit = false;
+
+ if (a->data_type == BCH_DATA_sb ||
+ a->data_type == BCH_DATA_journal ||
+ a->data_type == BCH_DATA_parity)
+ return 0;
+
+ u32 sectors[ALLOC_SECTORS_NR];
+ memset(sectors, 0, sizeof(sectors));
+
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
+ if (!ca)
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c bp_k;
int ret = 0;
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, alloc_k.k->p),
+ bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ continue;
- bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
- start, 0, 1, 0);
- bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
- while (1) {
- alloc_k = !alloc_end
- ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
- : bkey_s_c_null;
- bp_k = !bp_end
- ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
- : bkey_s_c_null;
-
- ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
- if ((!alloc_k.k && !bp_k.k) || ret) {
- *end = SPOS_MAX;
- break;
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ (bp.v->bucket_gen != a->gen ||
+ bp.v->pad)) {
+ ret = bch2_backpointer_del(trans, bp_k.k->p);
+ if (ret)
+ break;
+
+ need_commit = true;
+ continue;
}
- --btree_nodes;
- if (!btree_nodes) {
- *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
- break;
+ if (bp.v->bucket_gen != a->gen)
+ continue;
+
+ sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+ };
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ if (need_commit) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+ }
+
+ /* Cached pointers don't have backpointers: */
+
+ if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+ sectors[ALLOC_stripe] != a->stripe_sectors) {
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
+ ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
+ if (ret)
+ goto err;
}
- if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
- bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
- if (!bch2_btree_iter_advance(&alloc_iter))
- alloc_end = true;
- } else {
- if (!bch2_btree_iter_advance(&bp_iter))
- bp_end = true;
+ if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+ sectors[ALLOC_stripe] > a->stripe_sectors) {
+ ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto err;
}
+
+ if (!sectors[ALLOC_dirty] &&
+ !sectors[ALLOC_stripe])
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
+ else
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
}
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
+err:
+ bch2_dev_put(ca);
+ return ret;
+}
+
+static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr_v2: {
+ bool ret = false;
+
+ rcu_read_lock();
+ struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
+ while (pos.inode <= k.k->p.inode) {
+ if (pos.inode >= c->sb.nr_devices)
+ break;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
+ if (!ca)
+ goto next;
+
+ struct bpos bucket = bp_pos_to_bucket(ca, pos);
+ bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
+ ca->mi.nbuckets, bucket.offset);
+ if (bucket.offset == ca->mi.nbuckets)
+ goto next;
+
+ ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
+ if (ret)
+ break;
+next:
+ pos = SPOS(pos.inode + 1, 0, 0);
+ }
+ rcu_read_unlock();
+
+ return ret;
+ }
+ case KEY_TYPE_btree_ptr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
+ enum btree_id btree, unsigned level)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ if (b)
+ bch2_node_pin(trans->c, b);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ struct bkey_buf tmp;
+ bch2_bkey_buf_init(&tmp);
+
+ bch2_btree_cache_unpin(c);
+
+ *end = SPOS_MAX;
+
+ s64 mem_may_pin = mem_may_pin_bytes(c);
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
+ 0, 1, BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ if (!backpointer_node_has_missing(c, k))
+ continue;
+
+ mem_may_pin -= c->opts.btree_node_size;
+ if (mem_may_pin <= 0)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ BUG_ON(path->level != 1);
+
+ bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
+ }));
+ if (ret)
+ return ret;
+
+ struct bpos pinned = SPOS_MAX;
+ mem_may_pin = mem_may_pin_bytes(c);
+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
+ 0, 1, BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ if (!backpointer_node_has_missing(c, k))
+ continue;
+
+ mem_may_pin -= c->opts.btree_node_size;
+ if (mem_may_pin <= 0) {
+ *end = pinned;
+ break;
+ }
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ BUG_ON(path->level != 1);
+
+ int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
+
+ if (!ret2)
+ pinned = tmp.k->k.p;
+
+ ret;
+ }));
+ if (ret)
+ return ret;
+
return ret;
}
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
+ int ret = 0;
+
+ /*
+ * Can't allow devices to come/go/resize while we have bucket bitmaps
+ * allocated
+ */
+ lockdep_assert_held(&c->state_lock);
+
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->bucket_backpointer_mismatches);
+ ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!ca->bucket_backpointer_mismatches ||
+ !ca->bucket_backpointer_empty) {
+ bch2_dev_put(ca);
+ ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+ goto err_free_bitmaps;
+ }
+ }
+
struct btree_trans *trans = bch2_trans_get(c);
- struct extents_to_bp_state s = { .bucket_start = POS_MIN };
- int ret;
+ struct extents_to_bp_state s = { .bp_start = POS_MIN };
bch2_bkey_buf_init(&s.last_flushed);
bkey_init(&s.last_flushed.k->k);
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_prefetch, k, ({
+ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
+ }));
+ if (ret)
+ goto err;
+
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
+ for_each_member_device(c, ca) {
+ nr_buckets += ca->mi.nbuckets;
+ nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
+ nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
+ }
+
+ if (!nr_mismatches && !nr_empty)
+ goto err;
+
+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
+ nr_mismatches + nr_empty, nr_buckets);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+ ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
if (ret)
break;
- if ( bpos_eq(s.bucket_start, POS_MIN) &&
- !bpos_eq(s.bucket_end, SPOS_MAX))
+ if ( bpos_eq(s.bp_start, POS_MIN) &&
+ !bpos_eq(s.bp_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(s.bucket_start, POS_MIN) ||
- !bpos_eq(s.bucket_end, SPOS_MAX)) {
+ if (!bpos_eq(s.bp_start, POS_MIN) ||
+ !bpos_eq(s.bp_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, s.bucket_start);
+ bch2_bpos_to_text(&buf, s.bp_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, s.bucket_end);
+ bch2_bpos_to_text(&buf, s.bp_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
ret = bch2_check_extents_to_backpointers_pass(trans, &s);
- if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
+ if (ret || bpos_eq(s.bp_end, SPOS_MAX))
break;
- s.bucket_start = bpos_successor(s.bucket_end);
+ s.bp_start = bpos_successor(s.bp_end);
}
+err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
+ bch2_btree_cache_unpin(c);
+err_free_bitmaps:
+ for_each_member_device(c, ca) {
+ kvfree(ca->bucket_backpointer_empty);
+ ca->bucket_backpointer_empty = NULL;
+ kvfree(ca->bucket_backpointer_mismatches);
+ ca->bucket_backpointer_mismatches = NULL;
+ }
bch_err_fn(c, ret);
return ret;
@@ -770,61 +1152,70 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
- struct bkey_s_c_backpointer bp,
- struct bpos *last_flushed_pos)
+ struct bkey_s_c bp_k,
+ struct bkey_buf *last_flushed)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
struct bbpos pos = bp_to_bbpos(*bp.v);
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret;
if (bbpos_cmp(pos, start) < 0 ||
bbpos_cmp(pos, end) > 0)
return 0;
- k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
+ int ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
return 0;
if (ret)
return ret;
- if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
- *last_flushed_pos = bp.k->p;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
-
- if (fsck_err_on(!k.k, c,
- backpointer_to_missing_ptr,
- "backpointer for missing %s\n %s",
- bp.v->level ? "btree node" : "extent",
- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
- goto out;
- }
-out:
-fsck_err:
bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
return ret;
}
+static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
+ struct bch_dev *ca, struct bpos bucket)
+{
+ u32 restart_count = trans->restart_count;
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket),
+ bucket_pos_to_bp_end(ca, bucket),
+ 0, k,
+ check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
+ );
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct bpos last_flushed_pos = SPOS_MAX;
+ struct bch_fs *c = trans->c;
+ struct bkey_buf last_flushed;
+ struct progress_indicator_state progress;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
- return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed_pos));
+ int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_prefetch, k, ({
+ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ check_one_backpointer(trans, start, end, k, &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ return ret;
}
int bch2_check_backpointers_to_extents(struct bch_fs *c)
@@ -835,8 +1226,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
while (1) {
ret = bch2_get_btree_in_memory_pos(trans,
- (1U << BTREE_ID_extents)|
- (1U << BTREE_ID_reflink),
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink),
~0,
start, &end);
if (ret)
@@ -868,6 +1259,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
+ bch2_btree_cache_unpin(c);
+
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9feac..060dad1521ee 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "super.h"
static inline u64 swab40(u64 x)
@@ -17,15 +18,14 @@ static inline u64 swab40(u64 x)
((x & 0xff00000000ULL) >> 32));
}
-int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
- enum bkey_invalid_flags, struct printbuf *);
-void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
-void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
+ struct bkey_validate_context);
+void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);
#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
- .key_invalid = bch2_backpointer_invalid, \
- .val_to_text = bch2_backpointer_k_to_text, \
+ .key_validate = bch2_backpointer_validate, \
+ .val_to_text = bch2_backpointer_to_text, \
.swab = bch2_backpointer_swab, \
.min_val_size = 32, \
})
@@ -36,96 +36,133 @@ void bch2_backpointer_swab(struct bkey_s);
* Convert from pos in backpointer btree to pos of corresponding bucket in alloc
* btree:
*/
-static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
- struct bpos bp_pos)
+static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
+ u32 *bucket_offset)
+{
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
+}
+
+static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
+ if (ca)
+ *bucket = bp_pos_to_bucket(ca, bp_pos);
+ rcu_read_unlock();
+ return ca != NULL;
+}
+
+static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ return POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+}
+
/*
* Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
*/
-static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
- struct bpos ret;
-
- ret = POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+ struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
+ return ret;
+}
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bucket_pos_to_bp(ca, bucket, 0);
+}
- return ret;
+static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
- struct bch_backpointer, struct bkey_s_c, bool);
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
+ struct bkey_s_c,
+ struct bkey_i_backpointer *,
+ bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
- struct bpos bucket,
- struct bch_backpointer bp,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
bool insert)
{
if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
-
- struct bkey_i_backpointer bp_k;
-
- bkey_backpointer_init(&bp_k.k_i);
- bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
- bp_k.v = bp;
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
if (!insert) {
- bp_k.k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k.k, 0);
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
}
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
}
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
{
- return level ? BCH_DATA_btree :
- p.has_ec ? BCH_DATA_stripe :
- BCH_DATA_user;
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
}
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
- struct bpos *bucket_pos, struct bch_backpointer *bp)
+ const union bch_extent_entry *entry,
+ struct bkey_i_backpointer *bp)
{
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
- s64 sectors = level ? btree_sectors(c) : k.k->size;
- u32 bucket_offset;
-
- *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
- *bp = (struct bch_backpointer) {
+ bkey_backpointer_init(&bp->k_i);
+ bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+ bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
- .data_type = data_type,
- .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
- p.crc.offset,
- .bucket_len = ptr_disk_sectors(sectors, p),
+ .data_type = bch2_bkey_ptr_data_type(k, p, entry),
+ .bucket_gen = p.ptr.gen,
+ .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
.pos = k.k->p,
};
}
-int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
- struct bpos *, struct bch_backpointer *, unsigned);
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
- struct bpos, struct bch_backpointer,
- unsigned);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
- struct bpos, struct bch_backpointer);
+struct bkey_buf;
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *, unsigned, struct bkey_buf *);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *, struct bkey_buf *);
int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *);
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
index be2edced5213..63abe17f35ea 100644
--- a/fs/bcachefs/bbpos.h
+++ b/fs/bcachefs/bbpos.h
@@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)
static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
- prt_str(out, bch2_btree_id_str(pos.btree));
+ bch2_btree_id_to_text(out, pos.btree);
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
index 5198e94cf3b8..f63893344f80 100644
--- a/fs/bcachefs/bbpos_types.h
+++ b/fs/bcachefs/bbpos_types.h
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 69d0d60d50e3..161cf2f05d2a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -205,13 +205,16 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "btree_journal_iter_types.h"
+#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -265,6 +268,11 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+void bch2_print_str(struct bch_fs *, const char *);
+
+__printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);
@@ -286,6 +294,8 @@ do { \
#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_info_ratelimited(c, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
@@ -345,6 +355,12 @@ do { \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
+#define bch_verbose_ratelimited(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose) \
+ bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
#define pr_verbose_init(opts, fmt, ...) \
do { \
if (opt_get(opts, verbose)) \
@@ -355,6 +371,8 @@ do { \
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
"Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_node_merging_disabled, \
+ "Disables merging of btree nodes") \
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
"Causes mark and sweep to compact and rewrite every " \
"btree node it traverses") \
@@ -438,6 +456,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
+ x(blocked_key_cache_flush) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(blocked_write_buffer_full) \
@@ -451,7 +470,9 @@ enum bch_time_stats {
};
#include "alloc_types.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
@@ -463,6 +484,7 @@ enum bch_time_stats {
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
+#include "sb-members_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"
@@ -480,55 +502,24 @@ enum bch_time_stats {
struct btree;
-enum gc_phase {
- GC_PHASE_NOT_RUNNING,
- GC_PHASE_START,
- GC_PHASE_SB,
-
- GC_PHASE_BTREE_stripes,
- GC_PHASE_BTREE_extents,
- GC_PHASE_BTREE_inodes,
- GC_PHASE_BTREE_dirents,
- GC_PHASE_BTREE_xattrs,
- GC_PHASE_BTREE_alloc,
- GC_PHASE_BTREE_quotas,
- GC_PHASE_BTREE_reflink,
- GC_PHASE_BTREE_subvolumes,
- GC_PHASE_BTREE_snapshots,
- GC_PHASE_BTREE_lru,
- GC_PHASE_BTREE_freespace,
- GC_PHASE_BTREE_need_discard,
- GC_PHASE_BTREE_backpointers,
- GC_PHASE_BTREE_bucket_gens,
- GC_PHASE_BTREE_snapshot_trees,
- GC_PHASE_BTREE_deleted_inodes,
- GC_PHASE_BTREE_logged_ops,
- GC_PHASE_BTREE_rebalance_work,
-
- GC_PHASE_PENDING_DELETE,
-};
-
-struct gc_pos {
- enum gc_phase phase;
- struct bpos pos;
- unsigned level;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ atomic_long_t ref;
+ bool dying;
+ unsigned long last_put;
+#else
struct percpu_ref ref;
+#endif
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
@@ -554,36 +545,38 @@ struct bch_dev {
struct bch_devs_mask self;
- /* biosets used in cloned bios for writing multiple replicas */
- struct bio_set replica_set;
-
/*
* Buckets:
- * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
- * gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for ptr_stale():
+ * Per-bucket arrays are protected by either rcu_read_lock or
+ * state_lock, for device resize.
*/
- struct bucket_array __rcu *buckets_gc;
+ GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
- struct rw_semaphore bucket_lock;
- struct bch_dev_usage *usage_base;
- struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
- struct bch_dev_usage __percpu *usage_gc;
+ unsigned long *bucket_backpointer_mismatches;
+ unsigned long *bucket_backpointer_empty;
+
+ struct bch_dev_usage __percpu *usage;
/* Allocator: */
- u64 new_fs_bucket_idx;
- u64 alloc_cursor;
+ u64 alloc_cursor[3];
unsigned nr_open_buckets;
+ unsigned nr_partial_buckets;
unsigned nr_btree_reserve;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
+
atomic64_t rebalance_work;
struct journal_device journal;
@@ -593,7 +586,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct bch2_time_stats io_latency[2];
+ struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -609,7 +602,11 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
+ x(clean_recovery) \
+ x(btree_running) \
+ x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
x(was_rw) \
@@ -618,14 +615,15 @@ struct bch_dev {
x(going_ro) \
x(write_disable_complete) \
x(clean_shutdown) \
+ x(recovery_running) \
x(fsck_running) \
x(initial_gc_unfixed) \
- x(need_another_gc) \
x(need_delete_dead_snapshots) \
x(error) \
x(topology_error) \
x(errors_fixed) \
- x(errors_not_fixed)
+ x(errors_not_fixed) \
+ x(no_invalid_checks)
enum bch_fs_flags {
#define x(n) BCH_FS_##n,
@@ -662,38 +660,15 @@ struct journal_seq_blacklist_table {
} entries[];
};
-struct journal_keys {
- struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
- enum btree_id btree_id:8;
- unsigned level:8;
- bool allocated;
- bool overwritten;
- struct bkey_i *k;
- } *d;
- /*
- * Gap buffer: instead of all the empty space in the array being at the
- * end of the buffer - from @nr to @size - the empty space is at @gap.
- * This means that sequential insertions are O(n) instead of O(n^2).
- */
- size_t gap;
- size_t nr;
- size_t size;
- atomic_t ref;
- bool initial_ref_held;
-};
-
struct btree_trans_buf {
struct btree_trans *trans;
};
-#define REPLICAS_DELTA_LIST_MAX (1U << 16)
-
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
#define BCH_WRITE_REFS() \
+ x(journal) \
x(trans) \
x(write) \
x(promote) \
@@ -702,9 +677,14 @@ struct btree_trans_buf {
x(stripe_delete) \
x(reflink) \
x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
x(discard) \
+ x(discard_fast) \
+ x(check_discard_freespace_key) \
x(invalidate) \
x(delete_dead_snapshots) \
+ x(gc_gens) \
x(snapshot_delete_pagecache) \
x(sysfs) \
x(btree_write_buffer)
@@ -745,6 +725,12 @@ struct bch_fs {
struct percpu_ref writes;
#endif
/*
+ * Certain operations are only allowed in single threaded mode, during
+ * recovery, and we want to assert that this is the case:
+ */
+ struct task_struct *recovery_task;
+
+ /*
* Analagous to c->writes, for asynchronous ops that don't necessarily
* need fs to be read-write
*/
@@ -755,15 +741,14 @@ struct bch_fs {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+ struct bch_accounting_mem accounting;
+
struct bch_replicas_cpu replicas;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
- mempool_t replicas_delta_pool;
struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res replicas_journal_res;
struct journal_entry_res clock_journal_res;
- struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
@@ -775,6 +760,8 @@ struct bch_fs {
__uuid_t user_uuid;
u16 version;
+ u16 version_incompat;
+ u16 version_incompat_allowed;
u16 version_min;
u16 version_upgrade_complete;
@@ -789,7 +776,8 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -804,7 +792,6 @@ struct bch_fs {
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
- size_t snapshot_table_size;
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
@@ -815,7 +802,8 @@ struct bch_fs {
/* BTREE CACHE */
struct bio_set btree_bio;
- struct workqueue_struct *io_complete_wq;
+ struct workqueue_struct *btree_read_complete_wq;
+ struct workqueue_struct *btree_write_submit_wq;
struct btree_root btree_roots_known[BTREE_ID_NR];
DARRAY(struct btree_root) btree_roots_extra;
@@ -843,8 +831,11 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
- struct list_head pending_node_rewrites;
- struct mutex pending_node_rewrites_lock;
+ struct workqueue_struct *btree_node_rewrite_worker;
+ struct list_head btree_node_rewrites;
+ struct list_head btree_node_rewrites_pending;
+ spinlock_t btree_node_rewrites_lock;
+ struct closure_waitlist btree_node_rewrites_wait;
/* btree_io.c: */
spinlock_t btree_write_error_lock;
@@ -881,8 +872,10 @@ struct bch_fs {
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
+ unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
+ u64 reserved; /* sectors */
/*
* When capacity _decreases_ (due to a disk being removed), we
@@ -900,27 +893,20 @@ struct bch_fs {
struct percpu_rw_semaphore mark_lock;
seqcount_t usage_lock;
- struct bch_fs_usage *usage_base;
- struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
- struct bch_fs_usage __percpu *usage_gc;
+ struct bch_fs_usage_base __percpu *usage;
u64 __percpu *online_reserved;
- /* single element mempool: */
- struct mutex usage_scratch_lock;
- struct bch_fs_usage_online *usage_scratch;
+ unsigned long allocator_last_stuck;
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
- struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
- u64 blocked_allocate;
- u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@@ -940,12 +926,9 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct discard_work;
- struct work_struct invalidate_work;
/* GARBAGE COLLECTION */
- struct task_struct *gc_thread;
- atomic_t kick_gc;
+ struct work_struct gc_gens_work;
unsigned long gc_count;
enum btree_id gc_gens_btree;
@@ -975,6 +958,7 @@ struct bch_fs {
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
+ struct bio_set replica_set;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct bucket_nocow_lock_table
@@ -982,8 +966,7 @@ struct bch_fs {
struct rhashtable promote_table;
mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
- mempool_t decompress_workspace;
+ mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
struct crypto_shash *sha256;
@@ -1041,6 +1024,8 @@ struct bch_fs {
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
+ struct rhashtable vfs_inodes_table;
+ struct rhltable vfs_inodes_by_inum_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -1062,12 +1047,12 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
- /* bitmap of explicitly enabled recovery passes: */
- u64 recovery_passes_explicit;
+ enum bch_recovery_pass next_recovery_pass;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
enum bch_recovery_pass recovery_pass_done;
+ spinlock_t recovery_pass_lock;
struct semaphore online_fsck_mutex;
/* DEBUG JUNK */
@@ -1078,9 +1063,6 @@ struct bch_fs {
struct btree_node *verify_ondisk;
struct mutex verify_lock;
- u64 *unused_inode_hints;
- unsigned inode_shard_bits;
-
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
@@ -1095,15 +1077,13 @@ struct bch_fs {
struct journal_keys journal_keys;
struct list_head journal_iters;
+ struct find_btree_nodes found_btree_nodes;
+
u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
- unsigned btree_gc_periodic:1;
- unsigned copy_gc_enabled:1;
- bool promote_whole_extents;
-
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
@@ -1212,12 +1192,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
+ s64 sec;
s32 rem;
time += c->sb.time_base_lo;
- t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
- t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
return t;
}
@@ -1235,9 +1218,9 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now);
}
-static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
{
- return dev < c->sb.nr_devices && c->devs[dev];
+ return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
}
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0668b682a21c..f70f0108401f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -76,6 +76,7 @@
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include <uapi/linux/magic.h>
#include "vstructs.h"
#ifdef __KERNEL__
@@ -189,7 +190,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
struct bkey {
/* Size of combined key and value, in u64s */
@@ -212,17 +217,46 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
- struct bversion version;
+ struct bversion bversion;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ struct bversion bversion;
__u8 pad[1];
#endif
-} __packed __aligned(8);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when i was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means that
+ * decently offten extent keys will fit into only 8 bytes, instead of spilling over
+ * to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a bkig endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
struct bkey_packed {
__u64 _data[0];
@@ -294,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, version.hi), \
- bkey_format_field(VERSION_LO, version.lo), \
+ bkey_format_field(VERSION_HI, bversion.hi), \
+ bkey_format_field(VERSION_LO, bversion.lo), \
}, \
})
@@ -383,7 +417,9 @@ static inline void bkey_init(struct bkey *k)
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
- x(logged_op_finsert, 33)
+ x(logged_op_finsert, 33) \
+ x(accounting, 34) \
+ x(inode_alloc_cursor, 35)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -428,20 +464,12 @@ struct bch_backpointer {
__u8 btree_id;
__u8 level;
__u8 data_type;
- __u64 bucket_offset:40;
+ __u8 bucket_gen;
+ __u32 pad;
__u32 bucket_len;
struct bpos pos;
} __packed __aligned(8);
-/* LRU btree: */
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define LRU_ID_STRIPES (1U << 16)
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -468,17 +496,25 @@ struct bch_sb_field {
x(downgrade, 14)
#include "alloc_background_format.h"
+#include "dirent_format.h"
+#include "disk_accounting_format.h"
+#include "disk_groups_format.h"
#include "extents_format.h"
-#include "reflink_format.h"
#include "ec_format.h"
#include "inode_format.h"
-#include "dirent_format.h"
-#include "xattr_format.h"
-#include "quota_format.h"
+#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
+#include "lru_format.h"
+#include "quota_format.h"
+#include "reflink_format.h"
+#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
+#include "sb-downgrade_format.h"
+#include "sb-errors_format.h"
+#include "sb-members_format.h"
+#include "xattr_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -511,92 +547,6 @@ struct bch_sb_field_journal_v2 {
} d[];
};
-/* BCH_SB_FIELD_members_v1: */
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __le32 pad;
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
-};
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; //size of single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
/* BCH_SB_FIELD_crypt: */
struct nonce {
@@ -645,94 +595,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-/* BCH_SB_FIELD_replicas: */
-
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[];
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[];
-} __packed;
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_disk_groups: */
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -760,43 +622,11 @@ struct bch_sb_field_clean {
__u64 _data[];
};
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
-};
-
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
+ __le64 btrees_lost_data;
};
/* Superblock: */
@@ -840,7 +670,23 @@ struct bch_sb_field_downgrade {
x(snapshot_skiplists, BCH_VERSION(1, 1)) \
x(deleted_inodes, BCH_VERSION(1, 2)) \
x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4))
+ x(member_seq, BCH_VERSION(1, 4)) \
+ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
+ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
+ x(disk_accounting_v2, BCH_VERSION(1, 9)) \
+ x(disk_accounting_v3, BCH_VERSION(1, 10)) \
+ x(disk_accounting_inum, BCH_VERSION(1, 11)) \
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
+ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
+ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
+ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
+ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
+ x(inode_depth, BCH_VERSION(1, 17)) \
+ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
+ x(autofix_errors, BCH_VERSION(1, 19)) \
+ x(directory_size, BCH_VERSION(1, 20))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -856,7 +702,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
#define BCH_SB_SECTOR 8
-#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+
+#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
struct bch_sb_layout {
__uuid_t magic; /* bcachefs superblock UUID */
@@ -956,6 +803,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+ struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
@@ -1000,6 +849,12 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
struct bch_sb, flags[5], 0, 16);
+LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
+ struct bch_sb, flags[5], 16, 32);
+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
+ struct bch_sb, flags[5], 48, 64);
+LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -1052,21 +907,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(new_varint, 15) \
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
- x(extents_across_btree_nodes, 18)
+ x(extents_across_btree_nodes, 18) \
+ x(incompat_version_field, 19)
#define BCH_SB_FEATURES_ALWAYS \
- ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
- (1ULL << BCH_FEATURE_btree_updates_journalled)|\
- (1ULL << BCH_FEATURE_alloc_v2)|\
- (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
+ BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
+ BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
+ BIT_ULL(BCH_FEATURE_alloc_v2)|\
+ BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
(BCH_SB_FEATURES_ALWAYS| \
- (1ULL << BCH_FEATURE_new_siphash)| \
- (1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_new_varint)| \
- (1ULL << BCH_FEATURE_journal_no_flush))
+ BIT_ULL(BCH_FEATURE_new_siphash)| \
+ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
+ BIT_ULL(BCH_FEATURE_new_varint)| \
+ BIT_ULL(BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1107,8 +963,9 @@ enum bch_version_upgrade_opts {
#define BCH_ERROR_ACTIONS() \
x(continue, 0) \
- x(ro, 1) \
- x(panic, 2)
+ x(fix_safe, 1) \
+ x(panic, 2) \
+ x(ro, 3)
enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
@@ -1187,7 +1044,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
x(crc64, 2) \
x(xxhash, 3)
-enum bch_csum_opts {
+enum bch_csum_opt {
#define x(t, n) BCH_CSUM_OPT_##t = n,
BCH_CSUM_OPTS()
#undef x
@@ -1236,7 +1093,7 @@ enum bch_compression_opts {
UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
@@ -1275,9 +1132,10 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
- x(write_buffer_keys, 11)
+ x(write_buffer_keys, 11) \
+ x(datetime, 12)
-enum {
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -1289,7 +1147,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
switch (e->type) {
case BCH_JSET_ENTRY_btree_keys:
case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_overwrite:
case BCH_JSET_ENTRY_write_buffer_keys:
return true;
}
@@ -1323,7 +1180,7 @@ struct jset_entry_blacklist_v2 {
x(inodes, 1) \
x(key_version, 2)
-enum {
+enum bch_fs_usage_type {
#define x(f, nr) BCH_FS_USAGE_##f = nr,
BCH_FS_USAGE_TYPES()
#undef x
@@ -1376,6 +1233,20 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
+static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
+{
+ unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
+
+ while (b && !l->d[b - 1])
+ --b;
+ return b;
+}
+
+struct jset_entry_datetime {
+ struct jset_entry entry;
+ __le64 seconds;
+} __packed __aligned(8);
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1418,14 +1289,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */
enum btree_id_flags {
- BTREE_ID_EXTENTS = BIT(0),
- BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_SNAPSHOT_FIELD = BIT(2),
- BTREE_ID_DATA = BIT(3),
+ BTREE_IS_extents = BIT(0),
+ BTREE_IS_snapshots = BIT(1),
+ BTREE_IS_snapshot_field = BIT(2),
+ BTREE_IS_data = BIT(3),
+ BTREE_IS_write_buffer = BIT(4),
};
#define BCH_BTREE_IDS() \
- x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ x(extents, 0, \
+ BTREE_IS_extents| \
+ BTREE_IS_snapshots| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
@@ -1433,17 +1308,20 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_reservation)| \
BIT_ULL(KEY_TYPE_reflink_p)| \
BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ x(inodes, 1, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_inode)| \
BIT_ULL(KEY_TYPE_inode_v2)| \
BIT_ULL(KEY_TYPE_inode_v3)| \
BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ x(dirents, 2, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ x(xattrs, 3, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
@@ -1457,32 +1335,49 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_quota)) \
x(stripes, 6, 0, \
BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ x(reflink, 7, \
+ BTREE_IS_extents| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)| \
+ BIT_ULL(KEY_TYPE_error)) \
x(subvolumes, 8, 0, \
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, 0, \
+ x(lru, 10, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, BTREE_ID_EXTENTS, \
+ x(freespace, 11, \
+ BTREE_IS_extents, \
BIT_ULL(KEY_TYPE_set)) \
x(need_discard, 12, 0, \
BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, 0, \
+ x(backpointers, 13, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_backpointer)) \
x(bucket_gens, 14, 0, \
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ x(deleted_inodes, 16, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)) \
- x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+ BIT_ULL(KEY_TYPE_logged_op_finsert)| \
+ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
+ x(rebalance_work, 18, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
+ x(subvolume_children, 19, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(accounting, 20, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
+ BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -1491,6 +1386,29 @@ enum btree_id {
BTREE_ID_NR
};
+/*
+ * Maximum number of btrees that we will _ever_ have under the current scheme,
+ * where we refer to them with 64 bit bitfields - and we also need a bit for
+ * the interior btree node type:
+ */
+#define BTREE_ID_NR_MAX 63
+
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ case BTREE_ID_lru:
+ case BTREE_ID_accounting:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 4b8fba754b1c..3c23bdf788ce 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -5,6 +5,7 @@
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
+#include "bkey_types.h"
/*
* Flags common to multiple ioctls:
@@ -85,6 +86,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -251,12 +253,18 @@ struct bch_replicas_usage {
struct bch_replicas_entry_v1 r;
} __packed;
+static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
+{
+ return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
+}
+
static inline struct bch_replicas_usage *
replicas_usage_next(struct bch_replicas_usage *u)
{
- return (void *) u + replicas_entry_bytes(&u->r) + 8;
+ return (void *) u + replicas_usage_bytes(u);
}
+/* Obsolete */
/*
* BCH_IOCTL_FS_USAGE: query filesystem disk space usage
*
@@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage {
struct bch_replicas_usage replicas[];
};
+/* Obsolete */
/*
* BCH_IOCTL_DEV_USAGE: query device disk space usage
*
@@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage {
} d[10];
};
+/* Obsolete */
struct bch_ioctl_dev_usage_v2 {
__u64 dev;
__u32 flags;
@@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online {
__u64 opts; /* string */
};
+/*
+ * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_query_accounting {
+ __u64 capacity;
+ __u64 used;
+ __u64 online_reserved;
+
+ __u32 accounting_u64s; /* input parameter */
+ __u32 accounting_types_mask; /* input parameter */
+
+ struct bkey_i_accounting accounting[];
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 76e79a15ba08..995ba32e9b6e 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -640,10 +640,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
int bch2_bkey_format_invalid(struct bch_fs *c,
struct bkey_format *f,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
- unsigned i, bits = KEY_PACKED_BITS_START;
+ unsigned bits = KEY_PACKED_BITS_START;
if (f->nr_fields != BKEY_NR_FIELDS) {
prt_printf(err, "incorrect number of fields: got %u, should be %u",
@@ -655,21 +655,18 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
* Verify that the packed format can't represent fields larger than the
* unpacked format:
*/
- for (i = 0; i < f->nr_fields; i++) {
- if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ if (bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ unsigned packed_bits = min(64, f->bits_per_field[i]);
+ u64 packed_max = packed_bits
+ ? ~((~0ULL << 1) << (packed_bits - 1))
: 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max) {
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, field_offset, unpacked_max);
- return -BCH_ERR_invalid;
- }
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
+ return -BCH_ERR_invalid;
}
bits += f->bits_per_field[i];
@@ -1067,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{
const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
u8 *l = k->key_start;
- u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+ u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
while (l < h) {
swap(*l, *h);
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 831be01809f2..054e2d5e8448 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -4,17 +4,11 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
-
+#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
-enum bkey_invalid_flags {
- BKEY_INVALID_WRITE = (1U << 0),
- BKEY_INVALID_COMMIT = (1U << 1),
- BKEY_INVALID_JOURNAL = (1U << 2),
-};
-
#if 0
/*
@@ -31,57 +25,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@@ -245,6 +188,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
return bkey_gt(l, r) ? l : r;
}
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
@@ -257,9 +207,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-static __always_inline int bversion_zero(struct bversion v)
+static __always_inline bool bversion_zero(struct bversion v)
{
- return !bversion_cmp(v, ZERO_VERSION);
+ return bversion_cmp(v, ZERO_VERSION) == 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -362,10 +312,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+ return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+}
- EBUG_ON(k->u64s < ret);
- return ret;
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +506,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -743,8 +547,8 @@ static inline void bch2_bkey_pack_test(void) {}
x(BKEY_FIELD_OFFSET, p.offset) \
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
+ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
+ x(BKEY_FIELD_VERSION_LO, bversion.lo)
struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS];
@@ -771,8 +575,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+
+static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
+{
+ unsigned f_bits = f->bits_per_field[i];
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f_bits > unpacked_bits)
+ return true;
+
+ if ((f_bits == unpacked_bits) && field_offset)
+ return true;
+
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
+ return true;
+ return false;
+}
+
int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..15c93576b5c2 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -7,6 +7,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -26,27 +27,27 @@ const char * const bch2_bkey_types[] = {
NULL
};
-static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
return 0;
}
#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
- .key_invalid = deleted_key_invalid, \
+ .key_validate = deleted_key_validate, \
})
#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
- .key_invalid = deleted_key_invalid, \
+ .key_validate = deleted_key_validate, \
})
-static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
- bkey_val_size_nonzero,
+ bkey_fsck_err_on(bkey_val_bytes(k.k),
+ c, bkey_val_size_nonzero,
"incorrect value size (%zu != 0)",
bkey_val_bytes(k.k));
fsck_err:
@@ -54,11 +55,11 @@ fsck_err:
}
#define bch2_bkey_ops_error ((struct bkey_ops) { \
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
})
-static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
return 0;
}
@@ -72,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
}
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
- .key_invalid = key_type_cookie_invalid, \
+ .key_validate = key_type_cookie_validate, \
.val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
})
-static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
return 0;
}
@@ -97,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
datalen, min(datalen, 32U), d.v->data);
}
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
- .key_invalid = key_type_inline_data_invalid, \
- .val_to_text = key_type_inline_data_to_text, \
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
+ .key_validate = key_type_inline_data_validate, \
+ .val_to_text = key_type_inline_data_to_text, \
})
static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
@@ -109,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
}
#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
.key_merge = key_type_set_merge, \
})
@@ -122,22 +123,24 @@ const struct bkey_ops bch2_bkey_ops[] = {
const struct bkey_ops bch2_bkey_null_ops = {
};
-int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
- bkey_val_size_too_small,
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
+ c, bkey_val_size_too_small,
"bad val size (%zu < %u)",
bkey_val_bytes(k.k), ops->min_val_size);
- if (!ops->key_invalid)
+ if (!ops->key_validate)
return 0;
- ret = ops->key_invalid(c, k, flags, err);
+ ret = ops->key_validate(c, k, from);
fsck_err:
return ret;
}
@@ -157,38 +160,45 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
}
-int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
+ enum btree_node_type type = __btree_node_type(from.level, from.btree);
+
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
int ret = 0;
- bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
- bkey_u64s_too_small,
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
+ c, bkey_u64s_too_small,
"u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
- bkey_invalid_type_for_btree,
+ bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
+ (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
+ c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- bkey_fsck_err_on(k.k->size == 0, c, err,
- bkey_extent_size_zero,
+ bkey_fsck_err_on(k.k->size == 0,
+ c, bkey_extent_size_zero,
"size == 0");
- bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
- bkey_extent_size_greater_than_offset,
+ bkey_fsck_err_on(k.k->size > k.k->p.offset,
+ c, bkey_extent_size_greater_than_offset,
"size greater than offset (%u > %llu)",
k.k->size, k.k->p.offset);
} else {
- bkey_fsck_err_on(k.k->size, c, err,
- bkey_size_nonzero,
+ bkey_fsck_err_on(k.k->size,
+ c, bkey_size_nonzero,
"size != 0");
}
@@ -196,12 +206,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_id btree = type - 1;
if (btree_type_has_snapshots(btree)) {
- bkey_fsck_err_on(!k.k->p.snapshot, c, err,
- bkey_snapshot_zero,
+ bkey_fsck_err_on(!k.k->p.snapshot,
+ c, bkey_snapshot_zero,
"snapshot == 0");
} else if (!btree_type_has_snapshot_field(btree)) {
- bkey_fsck_err_on(k.k->p.snapshot, c, err,
- bkey_snapshot_nonzero,
+ bkey_fsck_err_on(k.k->p.snapshot,
+ c, bkey_snapshot_nonzero,
"nonzero snapshot");
} else {
/*
@@ -210,34 +220,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
*/
}
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
- bkey_at_pos_max,
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
+ c, bkey_at_pos_max,
"key at POS_MAX");
}
fsck_err:
return ret;
}
-int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- return __bch2_bkey_invalid(c, k, type, flags, err) ?:
- bch2_bkey_val_invalid(c, k, flags, err);
+ return __bch2_bkey_validate(c, k, from) ?:
+ bch2_bkey_val_validate(c, k, from);
}
int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k, struct printbuf *err)
+ struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
- bkey_before_start_of_btree_node,
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
+ c, bkey_before_start_of_btree_node,
"key before start of btree node");
- bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
- bkey_after_end_of_btree_node,
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
+ c, bkey_after_end_of_btree_node,
"key past end of btree node");
fsck_err:
return ret;
@@ -281,7 +290,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
bch2_bpos_to_text(out, k->p);
- prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
} else {
prt_printf(out, "(null)");
}
@@ -388,8 +397,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
for (i = 0; i < nr_compat; i++)
switch (!write ? i : nr_compat - 1 - i) {
case 0:
- if (big_endian != CPU_BIG_ENDIAN)
+ if (big_endian != CPU_BIG_ENDIAN) {
+ bch2_bkey_swab_key(f, k);
+ } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bch2_bkey_swab_key(f, k);
+ bch2_bkey_swab_key(f, k);
+ }
break;
case 1:
if (version < bcachefs_metadata_version_bkey_renumber)
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 03efe8ee565a..bf34111cdf00 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -14,22 +14,23 @@ extern const char * const bch2_bkey_types[];
extern const struct bkey_ops bch2_bkey_null_ops;
/*
- * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If
* invalid, entire key will be deleted.
*
* When invalid, error string is returned via @err. @rw indicates whether key is
* being read or written; more aggressive checks can be enabled when rw == WRITE.
*/
struct bkey_ops {
- int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err);
+ int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -47,14 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
: &bch2_bkey_null_ops;
}
-int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
- struct bkey_s_c, struct printbuf *);
+int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
+ struct bkey_validate_context from);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@@ -70,62 +71,16 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
return l->type == r->type &&
- !bversion_cmp(l->version, r->version) &&
+ !bversion_cmp(l->bversion, r->bversion) &&
bpos_eq(l->p, bkey_start_pos(r));
}
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-enum btree_update_flags {
- __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
- __BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
- __BTREE_TRIGGER_NORUN,
- __BTREE_TRIGGER_TRANSACTIONAL,
- __BTREE_TRIGGER_ATOMIC,
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-/* Don't run triggers at all */
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-/*
- * If set, we're running transactional triggers as part of a transaction commit:
- * triggers may generate new updates
- *
- * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set,
- * we're running atomic triggers during a transaction commit: we have our
- * journal reservation, we're holding btree node write locks, and we know the
- * transaction is going to commit (returning an error here is a fatal error,
- * causing us to go emergency read-only)
- */
-#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
-#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
-
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-
-/* @new is entering the btree */
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-
-/* @old is leaving the btree */
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-
-/* signal from bucket invalidate path to alloc trigger */
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-
static inline int bch2_key_trigger(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
@@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans,
}
static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans,
deleted.k.p = old.k->p;
return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_OVERWRITE|flags);
+ BTREE_TRIGGER_overwrite|flags);
}
static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans,
deleted.k.p = new.k->p;
return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ BTREE_TRIGGER_insert|flags);
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
@@ -172,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
struct bkey_packed *k)
{
if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN)
+ big_endian != CPU_BIG_ENDIAN ||
+ IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
__bch2_bkey_compat(level, btree_id, version,
big_endian, write, f, k);
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index bcca9e76a0b4..4536eb50fc40 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -6,9 +6,9 @@
#include "bset.h"
#include "extents.h"
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
+typedef int (*sort_cmp_fn)(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
static inline bool sort_iter_end(struct sort_iter *iter)
{
@@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
/*
* If keys compare equal, compare by pointer order:
*/
-static inline int key_sort_fix_overlapping_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
@@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
return nr;
}
-static inline int sort_keys_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (int) l->needs_whiteout - (int) r->needs_whiteout;
+ (long) l - (long) r;
}
-unsigned bch2_sort_keys(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
+#include "btree_update_interior.h"
+
+/*
+ * For sorting in the btree node write path: whiteouts not in the unwritten
+ * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
+ * dropped if overwritten by real keys:
+ */
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
{
- const struct bkey_format *f = &iter->b->format;
struct bkey_packed *in, *next, *out = dst;
- sort_iter_sort(iter, sort_keys_cmp);
+ sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
- while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- bool needs_whiteout = false;
+ while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
+ if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
+ continue;
- if (bkey_deleted(in) &&
- (filter_whiteouts || !in->needs_whiteout))
+ if ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
continue;
- while ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
- BUG_ON(in->needs_whiteout &&
- next->needs_whiteout);
- needs_whiteout |= in->needs_whiteout;
- in = sort_iter_next(iter, sort_keys_cmp);
- }
+ bkey_p_copy(out, in);
+ out = bkey_p_next(out);
+ }
- if (bkey_deleted(in)) {
- memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
- set_bkeyp_val_u64s(f, out, 0);
- } else {
- bkey_p_copy(out, in);
- }
- out->needs_whiteout |= needs_whiteout;
+ return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Main sort routine for compacting a btree node in memory: we always drop
+ * whiteouts because any whiteouts that need to be written are in the unwritten
+ * whiteouts area:
+ */
+unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
+
+ while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
+ if (bkey_deleted(in))
+ continue;
+
+ bkey_p_copy(out, in);
out = bkey_p_next(out);
}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
index 7c0f0b160f18..9be969d46890 100644
--- a/fs/bcachefs/bkey_sort.h
+++ b/fs/bcachefs/bkey_sort.h
@@ -48,7 +48,7 @@ bch2_sort_repack(struct bset *, struct btree *,
struct btree_node_iter *,
struct bkey_format *, bool);
-unsigned bch2_sort_keys(struct bkey_packed *,
- struct sort_iter *, bool);
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
+unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
new file mode 100644
index 000000000000..b4f328f9853c
--- /dev/null
+++ b/fs/bcachefs/bkey_types.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i - bkey with inline value
+ * bkey_s - bkey with split value
+ * bkey_s_c - bkey with split value, const
+ */
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+enum bch_validate_flags {
+ BCH_VALIDATE_write = BIT(0),
+ BCH_VALIDATE_commit = BIT(1),
+ BCH_VALIDATE_silent = BIT(2),
+};
+
+#define BKEY_VALIDATE_CONTEXTS() \
+ x(unknown) \
+ x(superblock) \
+ x(journal) \
+ x(btree_root) \
+ x(btree_node) \
+ x(commit)
+
+struct bkey_validate_context {
+ enum {
+#define x(n) BKEY_VALIDATE_##n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ } from:8;
+ enum bch_validate_flags flags:8;
+ u8 level;
+ enum btree_id btree;
+ bool root:1;
+ unsigned journal_offset;
+ u64 journal_seq;
+};
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..9a4a83d6fd2d 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -13,7 +13,7 @@
#include "trace.h"
#include "util.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
@@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
console_lock();
for_each_bset(b, t)
bch2_dump_bset(c, b, bset(b, t), t - b->set);
@@ -134,18 +132,23 @@ void bch2_dump_btree_node_iter(struct btree *b,
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
- struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
@@ -192,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
{
struct btree_node_iter_set *set, *s2;
struct bkey_packed *k, *p;
- struct bset_tree *t;
if (bch2_btree_node_iter_end(iter))
return;
@@ -207,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
/* Verify that set->end is correct: */
btree_node_iter_for_each(iter, set) {
for_each_bset(b, t)
- if (set->end == t->end_offset)
+ if (set->end == t->end_offset) {
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
goto found;
+ }
BUG();
found:
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
+ do {} while (0);
}
/* Verify iterator is sorted: */
@@ -300,11 +304,6 @@ struct bkey_float {
};
#define BKEY_MANTISSA_BITS 16
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
- return idx * sizeof(struct bkey_float);
-}
-
struct ro_aux_tree {
u8 nothing[0];
struct bkey_float f[];
@@ -324,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
- t->size * sizeof(u8), 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -356,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
return __aux_tree_base(b, t);
}
-static u8 *ro_aux_tree_prev(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
@@ -371,11 +361,9 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(const struct btree *b)
+static void bset_aux_tree_verify(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- const struct bset_tree *t;
-
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -477,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
bkey_float(b, t, j)->key_offset);
}
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
- return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
const struct bset_tree *t)
{
@@ -583,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
}
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
u64 v;
@@ -615,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
struct bkey_packed *m = tree_to_bkey(b, t, j);
struct bkey_packed *l = is_power_of_2(j)
? min_key
- : tree_to_prev_bkey(b, t, j >> ffs(j));
+ : tree_to_bkey(b, t, j >> ffs(j));
struct bkey_packed *r = is_power_of_2(j + 1)
? max_key
: tree_to_bkey(b, t, j >> (ffz(j) + 1));
@@ -666,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
- mantissa = bkey_mantissa(m, f, j);
+ mantissa = bkey_mantissa(m, f);
/*
* If we've got garbage bits, set them to all 1s - it's legal for the
@@ -679,20 +657,19 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
- return __bset_tree_capacity(b, t) /
- (sizeof(struct bkey_float) + sizeof(u8));
+ return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
}
-static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
@@ -718,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
- struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_packed *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
unsigned cacheline = 1;
@@ -731,12 +708,12 @@ retry:
return;
}
- t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+ t->extra = eytzinger1_extra(t->size - 1);
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_p_next(k);
+ k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -744,17 +721,12 @@ retry:
goto retry;
}
- ro_aux_tree_prev(b, t)[j] = prev->u64s;
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
- EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
- while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_p_next(k);
-
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
min_key.k.p = b->data->min_key;
@@ -913,6 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
+static void rw_aux_tree_insert_entry(struct btree *b,
+ struct bset_tree *t,
+ unsigned idx)
+{
+ EBUG_ON(!idx || idx > t->size);
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+ struct bkey_packed *end = idx < t->size
+ ? rw_aux_to_bkey(b, t, idx)
+ : btree_bkey_last(b, t);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (void *) end - (void *) start > L1_CACHE_BYTES) {
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[idx + 1],
+ &rw_aux_tree(b, t)[idx],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx]);
+ t->size++;
+ rw_aux_tree_set(b, t, idx, k);
+ break;
+ }
+ }
+ }
+}
+
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -920,84 +924,59 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+ unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
+ if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+ rw_aux_tree_insert_entry(b, t, t->size);
+ goto verify;
+ }
+
/* returns first entry >= where */
- l = rw_aux_tree_bsearch(b, t, where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
-
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
-
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ idx = rw_aux_tree_bsearch(b, t, where);
+
+ if (rw_aux_tree(b, t)[idx].offset == where) {
+ if (!idx) { /* never delete first entry */
+ idx++;
+ } else if (where < t->end_offset) {
+ rw_aux_tree_set(b, t, idx++, _where);
+ } else {
+ EBUG_ON(where != t->end_offset);
+ rw_aux_tree_insert_entry(b, t, --t->size);
+ goto verify;
+ }
+ }
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+ if (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset + shift ==
+ rw_aux_tree(b, t)[idx - 1].offset) {
+ memmove(&rw_aux_tree(b, t)[idx],
+ &rw_aux_tree(b, t)[idx + 1],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx + 1]);
+ t->size -= 1;
+ }
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
+ for (j = idx; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx - 1].offset);
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
+ rw_aux_tree_insert_entry(b, t, idx);
+verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
void bch2_bset_insert(struct btree *b,
- struct btree_node_iter *iter,
struct bkey_packed *where,
struct bkey_i *insert,
unsigned clobber_u64s)
@@ -1096,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p)
}
static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
@@ -1131,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
goto slowpath;
l = f->mantissa;
- r = bkey_mantissa(packed_search, f, n);
+ r = bkey_mantissa(packed_search, f);
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
goto slowpath;
n = n * 2 + (l < r);
@@ -1368,8 +1346,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
struct btree *b)
{
- struct bset_tree *t;
-
memset(iter, 0, sizeof(*iter));
for_each_bset(b, t)
@@ -1475,7 +1451,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
{
struct bkey_packed *k, *prev = NULL;
struct btree_node_iter_set *set;
- struct bset_tree *t;
unsigned end = 0;
if (bch2_expensive_debug_checks)
@@ -1544,9 +1519,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
- const struct bset_tree *t;
-
- for_each_bset(b, t) {
+ for_each_bset_c(b, t) {
enum bset_aux_tree_type type = bset_aux_tree_type(t);
size_t j;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa383..6953d55b72cc 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree *b)
}
#define for_each_bset(_b, _t) \
- for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+ for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define for_each_bset_c(_b, _t) \
+ for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
@@ -267,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
- struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
+ unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */
@@ -294,7 +297,6 @@ static inline struct bset_tree *
bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
{
unsigned offset = __btree_node_key_to_offset(b, k);
- struct bset_tree *t;
for_each_bset(b, t)
if (offset <= t->end_offset) {
@@ -458,6 +460,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac14a..1ec1f90e0eb3 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -14,9 +15,19 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/swap.h>
+
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
+} while (0)
const char * const bch2_btree_node_flags[] = {
-#define x(f) #f,
+ "typebit",
+ "typebit",
+ "typebit",
+#define x(f) [BTREE_NODE_##f] = #f,
BTREE_FLAGS()
#undef x
NULL
@@ -24,43 +35,82 @@ const char * const bch2_btree_node_flags[] = {
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
- unsigned i, reserve = 16;
+ unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
- c->btree_cache.reserve = reserve;
+ c->btree_cache.nr_reserve = reserve;
}
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->used - bc->reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(!list_empty(&b->list));
+
if (b->c.lock.readers)
- list_move(&b->list, &bc->freed_pcpu);
+ list_add(&b->list, &bc->freed_pcpu);
else
- list_move(&b->list, &bc->freed_nonpcpu);
+ list_add(&b->list, &bc->freed_nonpcpu);
}
-static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(!b->data);
+
+ bc->nr_freeable++;
+ list_add(&b->list, &bc->freeable);
+}
+
+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
+ mutex_lock(&bc->lock);
+ __bch2_btree_node_to_freelist(bc, b);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(btree_node_hashed(b));
+
+ /*
+ * This should really be done in slub/vmalloc, but we're using the
+ * kmalloc_large() path, so we're working around a slub bug by doing
+ * this here:
+ */
+ if (b->data)
+ mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+ if (b->aux_data)
+ mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -69,11 +119,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
- bc->used--;
-
btree_node_to_freedlist(bc, b);
}
+static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(list_empty(&b->list));
+ list_del_init(&b->list);
+ --bc->nr_freeable;
+ __btree_node_data_free(bc, b);
+}
+
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -84,17 +140,20 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .automatic_shrinking = true,
};
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
+ b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +166,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -144,51 +203,144 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return NULL;
}
- bch2_btree_lock_init(&b->c, 0);
+ bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
- bc->used++;
- list_add(&b->list, &bc->freeable);
+ __bch2_btree_node_to_freelist(bc, b);
return b;
}
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
+}
+
/* Btree in memory cache - hash table */
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
- int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ lockdep_assert_held(&bc->lock);
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ --bc->nr_by_btree[b->c.btree_id];
+ --bc->live[btree_node_pinned(b)].nr;
+ list_del_init(&b->list);
+}
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+ __bch2_btree_node_hash_remove(bc, b);
+ __bch2_btree_node_to_freelist(bc, b);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(!list_empty(&b->list));
BUG_ON(b->hash_val);
+
b->hash_val = btree_ptr_hash_val(&b->key);
+ int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ bc->nr_by_btree[b->c.btree_id]++;
+
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
- return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
+ list_add_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+ return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
- int ret;
-
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
- ret = __bch2_btree_node_hash_insert(bc, b);
- if (!ret)
- list_add_tail(&b->list, &bc->live);
+ int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
}
+void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
@@ -202,7 +354,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
@@ -212,38 +364,64 @@ wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b) ||
- btree_node_write_blocked(b) ||
- btree_node_will_make_reachable(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
+ goto out_unlock;
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
+ goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush)
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -273,21 +451,22 @@ out_unlock_intent:
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@@ -295,8 +474,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_read(&bc->dirty) + nr >=
- bc->used * 3 / 4;
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -311,7 +489,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@@ -328,24 +506,29 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b)) {
- btree_node_data_free(c, b);
+ if (!btree_node_reclaim(c, b, true)) {
+ btree_node_data_free(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->nr_freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- } else if (!btree_node_reclaim(c, b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+ --touched;;
+ } else if (!btree_node_reclaim(c, b, true)) {
+ __bch2_btree_node_hash_remove(bc, b);
+ __btree_node_data_free(bc, b);
+
freed++;
- btree_node_data_free(c, b);
+ bc->nr_freed++;
- bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -356,7 +539,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
- list_move(&bc->live, &b->list);
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@@ -370,8 +553,8 @@ restart:
break;
}
out_rotate:
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@@ -384,57 +567,57 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- unsigned i, flags;
+ struct btree *b, *t;
+ unsigned long flags;
- shrinker_free(bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
- kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+ kvfree(c->verify_ondisk);
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
- list_add(&r->b->list, &bc->live);
+ list_add(&r->b->list, &bc->live[0].list);
}
- list_splice(&bc->freeable, &bc->live);
-
- while (!list_empty(&bc->live)) {
- b = list_first_entry(&bc->live, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
- btree_node_data_free(c, b);
+ btree_node_data_free(bc, b);
}
BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_read(&c->btree_cache.dirty));
+ atomic_long_read(&c->btree_cache.nr_dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
- while (!list_empty(&bc->freed_nonpcpu)) {
- b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
six_lock_exit(&b->c.lock);
kfree(b);
@@ -443,6 +626,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ BUG_ON(bc->nr_by_btree[i]);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
+ BUG_ON(bc->nr_freeable);
+
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
@@ -462,22 +651,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->reserve; i++)
+ for (i = 0; i < bc->nr_reserve; i++)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
- bc->shrink = shrink;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 4;
- shrink->private_data = c;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@@ -488,7 +687,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -518,8 +720,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current)
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
goto success;
if (!cl) {
@@ -530,8 +732,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current) {
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
/* We raced */
closure_wake_up(&bc->alloc_wait);
goto success;
@@ -550,14 +752,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -577,9 +781,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
- unsigned flags;
- flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@@ -587,36 +789,42 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, false)) {
list_del_init(&b->list);
goto got_node;
}
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (!b) {
+ if (b) {
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
+ } else {
mutex_unlock(&bc->lock);
bch2_trans_unlock(trans);
b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
mutex_lock(&bc->lock);
}
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
-got_node:
+got_node:
/*
* btree_free() doesn't free memory; it sticks the node on the end of
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
+ if (!btree_node_reclaim(c, b2, false)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
+
+ list_del_init(&b2->list);
+ --bc->nr_freeable;
btree_node_to_freedlist(bc, b2);
+ mutex_unlock(&bc->lock);
+
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
goto got_mem;
@@ -630,11 +838,8 @@ got_node:
goto err;
}
- mutex_lock(&bc->lock);
- bc->used++;
got_mem:
- mutex_unlock(&bc->lock);
-
+ BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b));
@@ -651,7 +856,12 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
- memalloc_nofs_restore(flags);
+ int ret = bch2_trans_relock(trans);
+ if (unlikely(ret)) {
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
+ }
+
return b;
err:
mutex_lock(&bc->lock);
@@ -660,7 +870,7 @@ err:
if (bc->alloc_lock == current) {
b2 = btree_node_cannibalize(c);
clear_btree_node_just_written(b2);
- bch2_btree_node_hash_remove(bc, b2);
+ __bch2_btree_node_hash_remove(bc, b2);
if (b) {
swap(b->data, b2->data);
@@ -670,9 +880,9 @@ err:
six_unlock_intent(&b2->c.lock);
} else {
b = b2;
- list_del_init(&b->list);
}
+ BUG_ON(!list_empty(&b->list));
mutex_unlock(&bc->lock);
trace_and_count(c, btree_cache_cannibalize, trans);
@@ -680,7 +890,6 @@ err:
}
mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
@@ -696,9 +905,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- u32 seq;
- BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
@@ -711,6 +942,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ if (!path)
+ return b;
+
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -727,7 +961,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b->hash_val = 0;
mutex_lock(&bc->lock);
- list_add(&b->list, &bc->freeable);
+ __bch2_btree_node_to_freelist(bc, b);
mutex_unlock(&bc->lock);
six_unlock_write(&b->c.lock);
@@ -736,33 +970,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
set_btree_node_read_in_flight(b);
-
six_unlock_write(&b->c.lock);
- seq = six_lock_seq(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- /* Unlock before doing IO: */
- if (path && sync)
- bch2_trans_unlock_noassert(trans);
+ if (path) {
+ u32 seq = six_lock_seq(&b->c.lock);
- bch2_btree_node_read(trans, b, sync);
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (!sync)
- return NULL;
+ bch2_btree_node_read(trans, b, sync);
- if (path) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
return ERR_PTR(ret);
- }
- }
- if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- if (path)
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ if (!sync)
+ return NULL;
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
}
return b;
@@ -776,22 +1007,21 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
return;
prt_printf(&buf,
- "btree node header doesn't match ptr\n"
- "btree %s level %u\n"
- "ptr: ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ "btree node header doesn't match ptr: ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, "\nptr: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_printf(&buf, "\nheader: btree %s level %llu\n"
- "min ",
- bch2_btree_id_str(BTREE_NODE_ID(b->data)),
- BTREE_NODE_LEVEL(b->data));
+ prt_str(&buf, "\nheader: ");
+ bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
+ prt_str(&buf, "\nmin ");
bch2_bpos_to_text(&buf, b->data->min_key);
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
printbuf_exit(&buf);
}
@@ -814,7 +1044,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
bool need_relock = false;
int ret;
@@ -872,6 +1101,10 @@ retry:
bch2_btree_node_wait_on_read(b);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+
/*
* should_be_locked is not set on this path yet, so we need to
* relock it specifically:
@@ -901,7 +1134,7 @@ retry:
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -934,7 +1167,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
{
struct bch_fs *c = trans->c;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -992,7 +1224,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1011,7 +1243,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -1075,7 +1306,7 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- b = ERR_PTR(-EIO);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
goto out;
}
@@ -1094,18 +1325,22 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_cache_find(bc, k);
+ struct btree *b = btree_cache_find(bc, k);
if (b)
return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- return PTR_ERR_OR_ZERO(b);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
+ if (b)
+ six_unlock_read(&b->c.lock);
+ return 0;
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1117,6 +1352,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+
+ BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
/* not allowed to wait on io with btree locks held: */
@@ -1128,6 +1365,8 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1139,10 +1378,10 @@ wait_on_io:
BUG_ON(btree_node_dirty(b));
mutex_lock(&bc->lock);
- btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
+ btree_node_data_free(bc, b);
mutex_unlock(&bc->lock);
-
+out:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -1152,13 +1391,39 @@ const char *bch2_btree_id_str(enum btree_id btree)
return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}
+void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
+{
+ if (btree < BTREE_ID_NR)
+ prt_str(out, __bch2_btree_ids[btree]);
+ else
+ prt_printf(out, "(unknown btree %u)", btree);
+}
+
+void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
+{
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, btree);
+ prt_printf(out, " level=%u", level);
+}
+
+void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+ enum btree_id btree, unsigned level, struct bkey_s_c k)
+{
+ bch2_btree_id_to_text(out, btree);
+ prt_printf(out, " level %u/", level);
+ struct btree_root *r = bch2_btree_id_root(c, btree);
+ if (r)
+ prt_printf(out, "%u", r->level);
+ else
+ prt_printf(out, "(unknown)");
+ prt_printf(out, "\n ");
+
+ bch2_bkey_val_to_text(out, c, k);
+}
+
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
- prt_printf(out, "%s level %u/%u\n ",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level,
- bch2_btree_id_root(c, b->c.btree_id)->level);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
@@ -1203,9 +1468,47 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
+ const char *label, size_t nr)
+{
+ prt_printf(out, "%s\t", label);
+ prt_human_readable_u64(out, nr * c->opts.btree_node_size);
+ prt_printf(out, " (%zu)\n", nr);
+}
+
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ NULL
+};
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
- prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
+ bch2_btree_id_to_text(out, i);
+ prt_printf(out, "\t");
+ prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
+ prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
+ }
+
+ prt_newline(out);
+ prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
+ prt_printf(out, "not freed:\n");
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+ prt_printf(out, " %s\t%llu\n",
+ bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 6d33885fdbde..ca3c1b145330 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -12,11 +12,21 @@ struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
+void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+
+void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
+void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *);
+
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
@@ -118,19 +128,29 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i
} else {
unsigned idx = id - BTREE_ID_NR;
- EBUG_ON(idx >= c->btree_roots_extra.nr);
+ /* This can happen when we're called from btree_node_scan */
+ if (idx >= c->btree_roots_extra.nr)
+ return NULL;
+
return &c->btree_roots_extra.data[idx];
}
}
static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
{
- return bch2_btree_id_root(c, b->c.btree_id)->b;
+ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
+
+ return r ? r->b : NULL;
}
-const char *bch2_btree_id_str(enum btree_id);
+const char *bch2_btree_id_str(enum btree_id); /* avoid */
+void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
+void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
+
+void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
+ enum btree_id, unsigned, struct bkey_s_c);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643b1..dd1d9b74076e 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,25 +7,29 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
@@ -40,6 +44,23 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
+
+static const char * const bch2_gc_phase_strs[] = {
+#define x(n) #n,
+ GC_PHASES()
+#undef x
+ NULL
+};
+
+void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
+{
+ prt_str(out, bch2_gc_phase_strs[p->phase]);
+ prt_char(out, ' ');
+ bch2_btree_id_level_to_text(out, p->btree, p->level);
+ prt_char(out, ' ');
+ bch2_bpos_to_text(out, p->pos);
+}
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -49,12 +70,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
}}};
}
-static bool should_restart_for_topology_repair(struct bch_fs *c)
-{
- return c->opts.fix_errors != FSCK_FIX_no &&
- !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
-}
-
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
preempt_disable();
@@ -66,94 +81,10 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
__gc_pos_set(c, new_pos);
}
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
- struct btree *b,
- struct bkey_buf *prev,
- struct bkey_buf cur,
- bool is_last)
-{
- struct bpos node_start = b->data->min_key;
- struct bpos node_end = b->data->max_key;
- struct bpos expected_start = bkey_deleted(&prev->k->k)
- ? node_start
- : bpos_successor(prev->k->k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- int ret = 0;
-
- if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
- if (!bpos_eq(expected_start, bp->v.min_key)) {
- bch2_topology_error(c);
-
- if (bkey_deleted(&prev->k->k)) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, node_start);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
- }
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
- }
-
- if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
- bch2_bpos_to_text(&buf2, node_end);
-
- if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
-
- bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@@ -178,40 +109,22 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
-static void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
+ struct bkey_i_btree_ptr_v2 *new;
int ret;
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
-
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
}
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +150,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -259,7 +183,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
bch2_btree_node_drop_keys_outside_node(b);
mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
@@ -268,128 +192,144 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
+ struct bch_fs *c = trans->c;
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (!prev) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, b->data->min_key);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
- if (prev &&
- bpos_gt(expected_start, cur->data->min_key) &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
- /* cur overwrites prev: */
-
- if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
- cur->data->min_key), c,
- btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_PREV_NODE;
- goto out;
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
- bpos_predecessor(cur->data->min_key)), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- } else {
- /* prev overwrites cur: */
-
- if (mustfix_fsck_err_on(bpos_ge(expected_start,
- cur->data->max_key), c,
- btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_THIS_NODE;
- goto out;
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
- if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_min(c, cur, expected_start);
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ }
}
-out:
+err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
- struct btree *child)
+static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
+ struct btree *child, struct bpos *pulled_from_scan)
{
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
- bch2_bpos_to_text(&buf2, b->key.k.p);
-
- if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = set_node_max(c, child, b->key.k.p);
- if (ret)
- goto err;
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
+
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
}
err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
- bool have_child, dropped_children = false;
+ bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
-again:
- prev = NULL;
- have_child = dropped_children = false;
+
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -404,19 +344,23 @@ again:
ret = PTR_ERR_OR_ZERO(cur);
printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
+ prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(ret == -EIO, c,
- btree_node_unreadable,
- "Topology repair: unreadable btree node at btree %s level %u:\n"
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ trans, btree_node_read_error,
+ "Topology repair: unreadable btree node at\n"
" %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
+ cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- cur = NULL;
+ if (ret)
+ break;
+
+ ret = bch2_btree_lost_data(c, b->c.btree_id);
if (ret)
break;
continue;
@@ -426,7 +370,23 @@ again:
if (ret)
break;
- ret = btree_repair_node_boundaries(c, b, prev, cur);
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -444,6 +404,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@@ -451,8 +412,6 @@ again:
break;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -464,7 +423,11 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev);
+ ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
if (!IS_ERR_OR_NULL(prev))
@@ -478,7 +441,12 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ if (new_pass)
+ goto again;
+
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -493,7 +461,7 @@ again:
if (ret)
goto err;
- ret = bch2_btree_repair_topology_recurse(trans, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -501,7 +469,7 @@ again:
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- dropped_children = true;
+ new_pass = true;
}
if (ret)
@@ -511,14 +479,13 @@ again:
}
printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- if (mustfix_fsck_err_on(!have_child, c,
- btree_node_topology_interior_node_empty,
- "empty interior btree node at btree %s level %u\n"
- " %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level, buf.buf))
+ if (mustfix_fsck_err_on(!have_child,
+ trans, btree_node_topology_interior_node_empty,
+ "empty interior btree node at %s", buf.buf))
ret = DROP_THIS_NODE;
err:
fsck_err:
@@ -528,12 +495,14 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- if (!ret && dropped_children)
+ if (!ret && new_pass)
goto again;
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -541,550 +510,215 @@ fsck_err:
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree *b;
- unsigned i;
+ struct bpos pulled_from_scan = POS_MIN;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->alive)
- continue;
+ bch2_trans_srcu_unlock(trans);
- b = r->b;
- if (btree_node_fake(b))
- continue;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b);
- six_unlock_read(&b->c.lock);
-
- if (ret == DROP_THIS_NODE) {
- bch_err(c, "empty btree root - repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- }
- }
-
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- /*
- * XXX
- * use check_bucket_ref here
- */
- bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
-
- if (!g->gen_valid &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
- }
+ printbuf_reset(&buf);
+ bch2_btree_id_to_text(&buf, i);
- if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- set_bit(BCH_FS_need_another_gc, &c->flags);
- } else {
- do_update = true;
- }
- }
+ if (r->error) {
+ ret = bch2_btree_lost_data(c, i);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
- if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
- do_update = true;
-
- if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
- do_update = true;
-
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- continue;
+ r->alive = false;
+ r->error = 0;
- if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) != data_type, c,
- ptr_bucket_data_type_mismatch,
- "bucket %u:%zu different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->data_type = data_type;
- set_bit(BCH_FS_need_another_gc, &c->flags);
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", buf.buf);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
- do_update = true;
+ bch2_btree_root_alloc_fake_trans(trans, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
}
- }
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
+ reconstructed_root = true;
}
- }
- if (do_update) {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct bkey_i *new;
+ struct btree *b = r->b;
- if (is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
- new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
- if (!new) {
- ret = -BCH_ERR_ENOMEM_gc_repair_key;
- bch_err_msg(c, ret, "allocating new key");
- goto err;
- }
-
- bkey_reassemble(new, *k);
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- } else {
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->gen) < 0) ||
- gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type);
- }));
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
+ six_unlock_read(&b->c.lock);
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
- }
+ if (ret == DROP_THIS_NODE) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
- if (level)
- bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+ r->b = NULL;
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, *k);
- bch_info(c, "updated %s", buf.buf);
+ if (!reconstructed_root)
+ goto reconstruct_root;
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
+ bch_err(c, "empty btree root %s", buf.buf);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
+ r->alive = false;
+ ret = 0;
}
-
- *k = bkey_i_to_s_c(new);
}
-err:
fsck_err:
printbuf_exit(&buf);
+ bch2_trans_put(trans);
return ret;
}
/* marking of btree keys/nodes: */
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k,
+ unsigned level, struct btree **prev,
+ struct btree_iter *iter, struct bkey_s_c k,
bool initial)
{
struct bch_fs *c = trans->c;
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- int ret = 0;
-
- deleted.p = k->k->p;
- if (initial) {
- BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > atomic64_read(&c->journal.seq));
-
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
+ if (iter) {
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree *b = path_l(path)->b;
- if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
- bkey_version_in_future,
- "key version number higher than recorded: %llu > %llu",
- k->k->version.lo,
- atomic64_read(&c->key_version)))
- atomic64_set(&c->key_version, k->k->version.lo);
- }
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
-fsck_err:
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
-{
- struct bch_fs *c = trans->c;
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- struct bkey_buf prev, cur;
- int ret = 0;
-
- if (!btree_node_type_needs_gc(btree_node_type(b)))
- return 0;
-
- bch2_btree_node_iter_init_from_start(&iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
-
- while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, initial);
- if (ret)
- break;
-
- bch2_btree_node_iter_advance(&iter, b);
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_gc_check_topology(c, b, &prev, cur,
- bch2_btree_node_iter_end(&iter));
+ if (*prev != b) {
+ int ret = bch2_btree_node_check_topology(trans, b);
if (ret)
- break;
+ return ret;
}
+ *prev = b;
}
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
- bool initial, bool metadata_only)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- unsigned depth = metadata_only ? 1 : 0;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
-
- __for_each_btree_node(trans, iter, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b, ret) {
- bch2_verify_btree_nr_keys(b);
+ deleted.p = k.k->p;
- gc_pos_set(c, gc_pos_btree_node(b));
-
- ret = btree_gc_mark_node(trans, b, initial);
- if (ret)
- break;
+ if (initial) {
+ BUG_ON(bch2_journal_seq_verify &&
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
+
+ if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+ k.k->bversion.lo > atomic64_read(&c->key_version),
+ trans, bkey_version_in_future,
+ "key version number higher than recorded %llu\n %s",
+ atomic64_read(&c->key_version),
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ atomic64_set(&c->key_version, k.k->bversion.lo);
}
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
- mutex_lock(&c->btree_root_lock);
- b = bch2_btree_id_root(c, btree_id)->b;
- if (!btree_node_fake(b)) {
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- true, &k, initial);
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
+ trans, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
- gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
- mutex_unlock(&c->btree_root_lock);
-
- return ret;
-}
-static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
- unsigned target_depth)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur, prev;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
- false, &k, true);
- if (ret)
- goto fsck_err;
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
- k = bkey_i_to_s_c(cur.k);
+ /*
+ * We require a commit before key_trigger() because
+ * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the
+ * wrong result if we run it multiple times.
+ */
+ unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
- bch2_btree_and_journal_iter_advance(&iter);
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_check_repair|flags);
+ if (ret)
+ goto out;
- ret = bch2_gc_check_topology(c, b,
- &prev, cur,
- !bch2_btree_and_journal_iter_peek(&iter).k);
- if (ret)
- goto fsck_err;
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
- }
+ if (trans->nr_updates) {
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
- if (b->c.level > target_depth) {
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- struct btree *child;
-
- bch2_bkey_buf_reassemble(&cur, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- child = bch2_btree_node_get_noiter(trans, cur.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(child);
-
- if (ret == -EIO) {
- bch2_topology_error(c);
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_read_error,
- "Unreadable btree node at btree %s level %u:\n"
- " %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level - 1,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto fsck_err;
- } else {
- /* Continue marking when opted to not
- * fix the error: */
- ret = 0;
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- continue;
- }
- } else if (ret) {
- bch_err_msg(c, ret, "getting btree node");
- break;
- }
-
- ret = bch2_gc_btree_init_recurse(trans, child,
- target_depth);
- six_unlock_read(&child->c.lock);
-
- if (ret)
- break;
- }
- }
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
+out:
fsck_err:
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-static int bch2_gc_btree_init(struct btree_trans *trans,
- enum btree_id btree_id,
- bool metadata_only)
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
- struct btree *b;
- unsigned target_depth = metadata_only ? 1 : 0;
- struct printbuf buf = PRINTBUF;
+ unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
- b = bch2_btree_id_root(c, btree_id)->b;
+ /* We need to make sure every leaf node is readable before going RW */
+ if (initial)
+ target_depth = 0;
- if (btree_node_fake(b))
- return 0;
+ for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
+ struct btree *prev = NULL;
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
- six_lock_read(&b->c.lock, NULL, NULL);
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->min_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
- btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
+ bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
+ }));
+ if (ret)
+ goto err;
}
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->max_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
- btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
- }
+ /* root */
+ do {
+retry_root:
+ bch2_trans_begin(trans);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+ 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err_root;
- if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
- if (!ret) {
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
- &k, true);
- }
-fsck_err:
- six_unlock_read(&b->c.lock);
-
+ ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
+err_root:
+ bch2_trans_iter_exit(trans, &iter);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+err:
bch_err_fn(c, ret);
- printbuf_exit(&buf);
return ret;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
- return (int) btree_id_to_gc_phase(l) -
- (int) btree_id_to_gc_phase(r);
+ return cmp_int(gc_btree_order(l), gc_btree_order(r));
}
-static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
+ struct printbuf buf = PRINTBUF;
unsigned i;
int ret = 0;
@@ -1092,259 +726,52 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- for (i = 0; i < BTREE_ID_NR && !ret; i++)
- ret = initial
- ? bch2_gc_btree_init(trans, ids[i], metadata_only)
- : bch2_gc_btree(trans, ids[i], initial, metadata_only);
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
- for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
- if (!bch2_btree_id_root(c, i)->alive)
+ if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = initial
- ? bch2_gc_btree_init(trans, i, metadata_only)
- : bch2_gc_btree(trans, i, initial, metadata_only);
+ ret = bch2_gc_btree(trans, btree, true);
}
+ printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
-static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- unsigned flags)
-{
- u64 b = sector_to_bucket(ca, start);
-
- do {
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- gc_phase(GC_PHASE_SB), flags);
- b++;
- start += sectors;
- } while (start < end);
-}
-
-static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
+static int bch2_mark_superblocks(struct bch_fs *c)
{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- unsigned i;
- u64 b;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR)
- mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
- BCH_DATA_sb, flags);
-
- mark_metadata_sectors(c, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, flags);
- }
+ gc_pos_set(c, gc_phase(GC_PHASE_sb));
- for (i = 0; i < ca->journal.nr; i++) {
- b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), flags);
- }
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
}
-static void bch2_mark_superblocks(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB));
-
- for_each_online_member(c, ca)
- bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
- mutex_unlock(&c->sb_lock);
-}
-
-#if 0
-/* Also see bch2_pending_btree_node_free_insert_done() */
-static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
-{
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
-
- for_each_pending_btree_node_free(c, as, d)
- if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-#endif
-
static void bch2_gc_free(struct bch_fs *c)
{
+ bch2_accounting_gc_free(c);
+
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(c, ca) {
- kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
- ca->buckets_gc = NULL;
-
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
- }
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-}
-
-static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
-{
- struct bch_dev *ca = NULL;
- struct printbuf buf = PRINTBUF;
- bool verify = !metadata_only &&
- !c->opts.reconstruct_alloc &&
- (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
- unsigned i;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
-
-#define copy_field(_err, _f, _msg, ...) \
- if (dst->_f != src->_f && \
- (!verify || \
- fsck_err(c, _err, _msg ": got %llu, should be %llu" \
- , ##__VA_ARGS__, dst->_f, src->_f))) \
- dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
-
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
-
- __for_each_member_device(c, ca) {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
- dev_usage_u64s());
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_type_str(i));
- copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_type_str(i));
- copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
- }
- }
-
- {
- unsigned nr = fs_usage_u64s(c);
- struct bch_fs_usage *dst = c->usage_base;
- struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
-
- copy_fs_field(fs_usage_hidden_wrong,
- b.hidden, "hidden");
- copy_fs_field(fs_usage_btree_wrong,
- b.btree, "btree");
-
- if (!metadata_only) {
- copy_fs_field(fs_usage_data_wrong,
- b.data, "data");
- copy_fs_field(fs_usage_cached_wrong,
- b.cached, "cached");
- copy_fs_field(fs_usage_reserved_wrong,
- b.reserved, "reserved");
- copy_fs_field(fs_usage_nr_inodes_wrong,
- b.nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(fs_usage_persistent_reserved_wrong,
- persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, e);
-
- copy_fs_field(fs_usage_replicas_wrong,
- replicas[i], "%s", buf.buf);
- }
- }
-
-#undef copy_fs_field
-#undef copy_dev_field
-#undef copy_stripe_field
-#undef copy_field
-fsck_err:
- if (ca)
- percpu_ref_put(&ca->ref);
- bch_err_fn(c, ret);
-
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
+ for_each_member_device(c, ca)
+ genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
{
- BUG_ON(c->usage_gc);
-
- c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!c->usage_gc) {
- bch_err(c, "error allocating c->usage_gc");
- return -BCH_ERR_ENOMEM_gc_start;
- }
-
for_each_member_device(c, ca) {
- BUG_ON(ca->usage_gc);
-
- ca->usage_gc = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage_gc) {
- bch_err(c, "error allocating ca->usage_gc");
- percpu_ref_put(&ca->ref);
- return -BCH_ERR_ENOMEM_gc_start;
+ int ret = bch2_dev_usage_init(ca, true);
+ if (ret) {
+ bch2_dev_put(ca);
+ return ret;
}
-
- this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
- ca->mi.nbuckets - ca->mi.first_bucket);
}
return 0;
}
-static int bch2_gc_reset(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
- }
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-
- return bch2_gc_start(c);
-}
-
/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
struct bch_alloc_v4 r)
@@ -1353,6 +780,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type ||
l.dirty_sectors != r.dirty_sectors ||
+ l.stripe_sectors != r.stripe_sectors ||
l.cached_sectors != r.cached_sectors ||
l.stripe_redundancy != r.stripe_redundancy ||
l.stripe != r.stripe;
@@ -1360,59 +788,55 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
static int bch2_alloc_write_key(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c k,
- bool metadata_only)
+ struct bch_dev *ca,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket gc, *b;
struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_convert, new;
+ struct bch_alloc_v4 old_gc, gc, old_convert, new;
const struct bch_alloc_v4 *old;
- enum bch_data_type type;
int ret;
+ if (!bucket_valid(ca, k.k->p.offset))
+ return 0;
+
old = bch2_alloc_to_v4(k, &old_convert);
- new = *old;
+ gc = new = *old;
+
+ __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
- percpu_down_read(&c->mark_lock);
- b = gc_bucket(ca, iter->pos.offset);
+ old_gc = gc;
+
+ if ((old->data_type == BCH_DATA_sb ||
+ old->data_type == BCH_DATA_journal) &&
+ !bch2_dev_is_online(ca)) {
+ gc.data_type = old->data_type;
+ gc.dirty_sectors = old->dirty_sectors;
+ }
/*
- * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * gc.data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- if (b->data_type != type) {
- struct bch_dev_usage *u;
-
- preempt_disable();
- u = this_cpu_ptr(ca->usage_gc);
- u->d[b->data_type].buckets--;
- b->data_type = type;
- u->d[b->data_type].buckets++;
- preempt_enable();
- }
-
- gc = *b;
- percpu_up_read(&c->mark_lock);
-
- if (metadata_only &&
- gc.data_type != BCH_DATA_sb &&
- gc.data_type != BCH_DATA_journal &&
- gc.data_type != BCH_DATA_btree)
- return 0;
+ alloc_data_type_set(&gc, gc.data_type);
+ if (gc.data_type != old_gc.data_type ||
+ gc.dirty_sectors != old_gc.dirty_sectors) {
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
- if (gen_after(old->gen, gc.gen))
- return 0;
+ /*
+ * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
+ * safe w.r.t. transaction restarts, so fixup the gc_bucket so
+ * we don't run it twice:
+ */
+ struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
+ gc_m->data_type = gc.data_type;
+ gc_m->dirty_sectors = gc.dirty_sectors;
+ }
- if (c->opts.reconstruct_alloc ||
- fsck_err_on(new.data_type != gc.data_type, c,
- alloc_key_data_type_wrong,
+ if (fsck_err_on(new.data_type != gc.data_type,
+ trans, alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
@@ -1422,26 +846,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, \
+ trans, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", \
+ ": got %llu, should be %llu", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
bch2_data_type_str(gc.data_type), \
- new._f, gc._f)) \
+ (u64) new._f, (u64) gc._f)) \
new._f = gc._f; \
- copy_bucket_field(alloc_key_gen_wrong,
- gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong,
- dirty_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong,
- cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong,
- stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong,
- stripe_redundancy);
+ copy_bucket_field(alloc_key_gen_wrong, gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
+ copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong, stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@@ -1455,31 +875,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
a->v = new;
/*
- * The trigger normally makes sure this is set, but we're not running
+ * The trigger normally makes sure these are set, but we're not running
* triggers:
*/
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
return ret;
}
-static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_done(struct bch_fs *c)
{
int ret = 0;
for_each_member_device(c, ca) {
ret = bch2_trans_run(c,
- for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ BTREE_ITER_slots|BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_key(trans, &iter, ca, k)));
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
break;
}
}
@@ -1488,179 +908,23 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
return ret;
}
-static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_start(struct bch_fs *c)
{
- for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!buckets) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -BCH_ERR_ENOMEM_gc_alloc_start;
- }
-
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = ca->mi.nbuckets;
- rcu_assign_pointer(ca->buckets_gc, buckets);
- }
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- g->gen_valid = 1;
- g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
-
- 0;
- })));
- bch_err_fn(c, ret);
- return ret;
-}
+ int ret = 0;
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
-{
for_each_member_device(c, ca) {
- struct bucket_array *buckets = gc_bucket_array(ca);
- struct bucket *g;
-
- for_each_bucket(g, buckets) {
- if (metadata_only &&
- (g->data_type == BCH_DATA_user ||
- g->data_type == BCH_DATA_cached ||
- g->data_type == BCH_DATA_parity))
- continue;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
+ bch2_dev_put(ca);
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ break;
}
}
-}
-
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- size_t *idx)
-{
- struct bch_fs *c = trans->c;
- const __le64 *refcount = bkey_refcount_c(k);
- struct printbuf buf = PRINTBUF;
- struct reflink_gc *r;
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
- r->offset < k.k->p.offset)
- ++*idx;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- reflink_v_refcount_wrong,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
- r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
-{
- size_t idx = 0;
-
- if (metadata_only)
- return 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
- c->reflink_gc_nr = 0;
- return ret;
-}
-
-static int bch2_gc_reflink_start(struct bch_fs *c,
- bool metadata_only)
-{
-
- if (metadata_only)
- return 0;
-
- c->reflink_gc_nr = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
- c->reflink_gc_nr++, GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- 0;
- })));
bch_err_fn(c, ret);
return ret;
}
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
-{
- struct genradix_iter iter;
- struct reflink_gc *r;
-
- genradix_for_each(&c->reflink_gc_table, iter, r)
- r->refcount = 0;
-}
-
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@@ -1693,7 +957,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
- if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ if (fsck_err_on(bad,
+ trans, stripe_sector_count_wrong,
"%s", buf.buf)) {
struct bkey_i_stripe *new;
@@ -1714,30 +979,20 @@ fsck_err:
return ret;
}
-static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_stripes_done(struct bch_fs *c)
{
- if (metadata_only)
- return 0;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_stripes_key(trans, &iter, k)));
}
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
-{
- genradix_free(&c->gc_stripes);
-}
-
/**
- * bch2_gc - walk _all_ references to buckets, and recompute them:
+ * bch2_check_allocations - walk all references to buckets, and recompute them:
*
* @c: filesystem object
- * @initial: are we in recovery?
- * @metadata_only: are we just checking metadata references, or everything?
*
* Returns: 0 on success, or standard errcode on failure
*
@@ -1756,9 +1011,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+int bch2_check_allocations(struct bch_fs *c)
{
- unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1767,67 +1021,34 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
bch2_btree_interior_updates_flush(c);
- ret = bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c, metadata_only) ?:
- bch2_gc_reflink_start(c, metadata_only);
+ ret = bch2_gc_accounting_start(c) ?:
+ bch2_gc_start(c) ?:
+ bch2_gc_alloc_start(c) ?:
+ bch2_gc_reflink_start(c);
if (ret)
goto out;
-again:
- gc_pos_set(c, gc_phase(GC_PHASE_START));
- bch2_mark_superblocks(c);
+ gc_pos_set(c, gc_phase(GC_PHASE_start));
- ret = bch2_gc_btrees(c, initial, metadata_only);
+ ret = bch2_mark_superblocks(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto out;
+ ret = bch2_gc_btrees(c);
if (ret)
goto out;
-#if 0
- bch2_mark_pending_btree_node_frees(c);
-#endif
c->gc_count++;
- if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
- (!iter && bch2_test_restart_gc)) {
- if (iter++ > 2) {
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
- goto out;
- }
-
- /*
- * XXX: make sure gens we fixed got saved
- */
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_need_another_gc, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
- bch2_gc_stripes_reset(c, metadata_only);
- bch2_gc_alloc_reset(c, metadata_only);
- bch2_gc_reflink_reset(c, metadata_only);
- ret = bch2_gc_reset(c);
- if (ret)
- goto out;
-
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
- goto again;
- }
+ ret = bch2_gc_alloc_done(c) ?:
+ bch2_gc_accounting_done(c) ?:
+ bch2_gc_stripes_done(c) ?:
+ bch2_gc_reflink_done(c);
out:
- if (!ret) {
- bch2_journal_block(&c->journal);
-
- ret = bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only) ?:
- bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only);
-
- bch2_journal_unblock(&c->journal);
- }
-
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
@@ -1852,24 +1073,31 @@ static int gc_btree_gens_key(struct btree_trans *trans,
struct bkey_i *u;
int ret;
- percpu_down_read(&c->mark_lock);
+ if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
+ return -EROFS;
+
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (ptr_stale(ca, ptr) > 16) {
- percpu_up_read(&c->mark_lock);
+ if (dev_ptr_stale(ca, ptr) > 16) {
+ rcu_read_unlock();
goto update;
}
}
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
if (gen_after(*gen, ptr->gen))
*gen = ptr->gen;
}
- percpu_up_read(&c->mark_lock);
+ rcu_read_unlock();
return 0;
update:
u = bch2_bkey_make_mut(trans, iter, &k, 0);
@@ -1881,10 +1109,9 @@ update:
return 0;
}
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
struct bkey_i_alloc_v4 *a_mut;
@@ -1899,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
return ret;
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
- a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
@@ -1909,16 +1135,20 @@ int bch2_gc_gens(struct bch_fs *c)
u64 b, start_time = local_clock();
int ret;
- /*
- * Ideally we would be using state_lock and not gc_lock here, but that
- * introduces a deadlock in the RO path - we currently take the state
- * lock at the start of going RO, thus the gc thread may get stuck:
- */
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->gc_lock);
+
+ /*
+ * We have to use trylock here. Otherwise, we would
+ * introduce a deadlock in the RO path - we take the
+ * state lock at the start of going RO.
+ */
+ if (!down_read_trylock(&c->state_lock)) {
+ mutex_unlock(&c->gc_gens_lock);
+ return 0;
+ }
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
@@ -1927,7 +1157,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
@@ -1945,7 +1175,7 @@ int bch2_gc_gens(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, i,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -1954,14 +1184,23 @@ int bch2_gc_gens(struct bch_fs *c)
goto err;
}
+ struct bch_dev *ca = NULL;
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN,
- BTREE_ITER_PREFETCH,
+ BTREE_ITER_prefetch,
k,
NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_oldest_gen(trans, &iter, k)));
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
+ })));
+ bch2_dev_put(ca);
+
if (ret)
goto err;
@@ -1978,94 +1217,37 @@ err:
ca->oldest_gen = NULL;
}
- up_read(&c->gc_lock);
+ up_read(&c->state_lock);
mutex_unlock(&c->gc_gens_lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
return ret;
}
-static int bch2_gc_thread(void *arg)
+static void bch2_gc_gens_work(struct work_struct *work)
{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic64_read(&clock->now);
- unsigned last_kick = atomic_read(&c->kick_gc);
-
- set_freezable();
-
- while (1) {
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
- if (atomic_read(&c->kick_gc) != last_kick)
- break;
-
- if (c->btree_gc_periodic) {
- unsigned long next = last + c->capacity / 16;
-
- if (atomic64_read(&clock->now) >= next)
- break;
-
- bch2_io_clock_schedule_timeout(clock, next);
- } else {
- schedule();
- }
-
- try_to_freeze();
- }
- __set_current_state(TASK_RUNNING);
-
- last = atomic64_read(&clock->now);
- last_kick = atomic_read(&c->kick_gc);
-
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- ret = bch2_gc(c, false, false);
-#else
- bch2_gc_gens(c);
-#endif
- debug_check_no_locks_held();
- }
-
- return 0;
+ struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
+ bch2_gc_gens(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-void bch2_gc_thread_stop(struct bch_fs *c)
+void bch2_gc_gens_async(struct bch_fs *c)
{
- struct task_struct *p;
-
- p = c->gc_thread;
- c->gc_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ !queue_work(c->write_ref_wq, &c->gc_gens_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-int bch2_gc_thread_start(struct bch_fs *c)
+void bch2_fs_btree_gc_exit(struct bch_fs *c)
{
- struct task_struct *p;
-
- if (c->gc_thread)
- return 0;
+}
- p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
- if (IS_ERR(p)) {
- bch_err_fn(c, PTR_ERR(p));
- return PTR_ERR(p);
- }
+int bch2_fs_btree_gc_init(struct bch_fs *c)
+{
+ seqcount_init(&c->gc_pos_lock);
+ INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
- get_task_struct(p);
- c->gc_thread = p;
- wake_up_process(p);
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
return 0;
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 607575f83a00..9693a90a48a2 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -3,13 +3,11 @@
#define _BCACHEFS_BTREE_GC_H
#include "bkey.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
-int bch2_gc(struct bch_fs *, bool, bool);
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_thread_stop(struct bch_fs *);
-int bch2_gc_thread_start(struct bch_fs *);
+int bch2_check_allocations(struct bch_fs *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@@ -35,60 +33,36 @@ int bch2_gc_thread_start(struct bch_fs *);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
- return (struct gc_pos) {
- .phase = phase,
- .pos = POS_MIN,
- .level = 0,
- };
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- bpos_cmp(l.pos, r.pos) ?:
- cmp_int(l.level, r.level);
-}
-
-static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
-{
- switch (id) {
-#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
- BCH_BTREE_IDS()
-#undef x
- default:
- BUG();
- }
+ return (struct gc_pos) { .phase = phase, };
}
-static inline struct gc_pos gc_pos_btree(enum btree_id id,
- struct bpos pos, unsigned level)
+static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
+ struct bpos pos)
{
return (struct gc_pos) {
- .phase = btree_id_to_gc_phase(id),
- .pos = pos,
+ .phase = GC_PHASE_btree,
+ .btree = btree,
.level = level,
+ .pos = pos,
};
}
-/*
- * GC position of the pointers within a btree node: note, _not_ for &b->key
- * itself, that lives in the parent node:
- */
-static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+static inline int gc_btree_order(enum btree_id btree)
{
- return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
+ if (btree == BTREE_ID_alloc)
+ return -2;
+ if (btree == BTREE_ID_stripes)
+ return -1;
+ return btree;
}
-/*
- * GC position of the pointer to a btree root: we don't use
- * gc_pos_pointer_to_btree_node() here to avoid a potential race with
- * btree_split() increasing the tree depth - the new root will have level > the
- * old root and thus have a greater gc position than the old root, but that
- * would be incorrect since once gc has marked the root it's not coming back.
- */
-static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+ return cmp_int(l.phase, r.phase) ?:
+ cmp_int(gc_btree_order(l.btree),
+ gc_btree_order(r.btree)) ?:
+ cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
@@ -104,11 +78,12 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
-static inline void bch2_do_gc_gens(struct bch_fs *c)
-{
- atomic_inc(&c->kick_gc);
- if (c->gc_thread)
- wake_up_process(c->gc_thread);
-}
+void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
+
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_gens_async(struct bch_fs *);
+
+void bch2_fs_btree_gc_exit(struct bch_fs *);
+int bch2_fs_btree_gc_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
new file mode 100644
index 000000000000..c24dd6edf377
--- /dev/null
+++ b/fs/bcachefs/btree_gc_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_TYPES_H
+#define _BCACHEFS_BTREE_GC_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+#define GC_PHASES() \
+ x(not_running) \
+ x(start) \
+ x(sb) \
+ x(btree)
+
+enum gc_phase {
+#define x(n) GC_PHASE_##n,
+ GC_PHASES()
+#undef x
+};
+
+struct gc_pos {
+ enum gc_phase phase:8;
+ enum btree_id btree:8;
+ u16 level;
+ struct bpos pos;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe3226..756736f9243d 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -23,6 +23,17 @@
#include <linux/sched/mm.h>
+static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
+{
+ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
+ prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
+ prt_str(out, "min: ");
+ bch2_bpos_to_text(out, bn->min_key);
+ prt_newline(out);
+ prt_str(out, "max: ");
+ bch2_bpos_to_text(out, bn->max_key);
+}
+
void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
@@ -34,8 +45,6 @@ void bch2_btree_node_io_unlock(struct btree *b)
void bch2_btree_node_io_lock(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -54,16 +63,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b)
void bch2_btree_node_wait_on_read(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -103,7 +108,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
- vpfree(p, size);
+ kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +120,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -217,7 +222,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t,
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
- struct bset_tree *t;
bool ret = false;
for_each_bset(b, t) {
@@ -288,8 +292,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
static void btree_node_sort(struct bch_fs *c, struct btree *b,
unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
+ unsigned end_idx)
{
struct btree_node *out;
struct sort_iter_stack sort_iter;
@@ -320,7 +323,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
out->keys.u64s = cpu_to_le16(u64s);
@@ -426,13 +429,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
break;
if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx,
- b->nsets, false);
+ btree_node_sort(c, b, unwritten_idx, b->nsets);
ret = true;
}
if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx, false);
+ btree_node_sort(c, b, 0, unwritten_idx);
ret = true;
}
@@ -441,8 +443,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
void bch2_btree_build_aux_trees(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
!bset_written(b, bset(b, t)) &&
@@ -489,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
if (b->nsets == MAX_BSETS &&
!btree_node_write_in_flight(b) &&
should_compact_all(c, b)) {
- bch2_btree_node_write(c, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
+ bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
reinit_iter = true;
}
@@ -512,7 +512,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
- struct btree *b, struct bset *i,
+ struct btree *b, struct bset *i, struct bkey_packed *k,
unsigned offset, int write)
{
prt_printf(out, bch2_log_msg(c, "%s"),
@@ -524,28 +524,36 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u/%u",
- b->written, btree_ptr_sectors_written(&b->key));
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "\nnode offset %u/%u",
+ b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ if (k)
+ prt_printf(out, " bset byte offset %lu",
+ (unsigned long)(void *)k -
+ ((unsigned long)(void *)i & ~511UL));
prt_str(out, ": ");
}
-__printf(9, 10)
+__printf(10, 11)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
+ struct bkey_packed *k,
int write,
bool have_retry,
enum bch_sb_error_id err_type,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
+ bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
va_list args;
- btree_err_msg(&out, c, ca, b, i, b->written, write);
+ btree_err_msg(&out, c, ca, b, i, k, b->written, write);
va_start(args, fmt);
prt_vprintf(&out, fmt, args);
@@ -564,12 +572,14 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
- if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
bch2_sb_error_count(c, err_type);
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ ret = !silent
+ ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
+ : -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
goto fsck_err;
@@ -577,15 +587,17 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_want_retry:
case -BCH_ERR_btree_node_read_err_must_retry:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
break;
case -BCH_ERR_btree_node_read_err_bad_node:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- bch2_topology_error(c);
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = -BCH_ERR_fsck_errors_not_fixed;
break;
default:
@@ -597,9 +609,9 @@ fsck_err:
return ret;
}
-#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
+#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
BCH_FSCK_ERR_##_err_type, \
msg, ##__VA_ARGS__); \
\
@@ -620,8 +632,6 @@ fsck_err:
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k;
@@ -655,6 +665,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
@@ -671,13 +682,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
@@ -685,7 +697,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
@@ -698,7 +710,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
@@ -710,29 +722,28 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (btree_err_on(offset + sectors > btree_sectors(c),
+ if (!write &&
+ btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_past_end_of_btree_node,
- "bset past end of btree node")) {
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ offset, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
- ret = 0;
- goto out;
- }
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_empty,
"empty bset");
btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_wrong_sector_offset,
"bset at wrong sector offset");
@@ -748,20 +759,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_level,
"incorrect level");
@@ -780,7 +791,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
@@ -791,7 +802,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
@@ -803,7 +814,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_format,
"invalid bkey format: %s\n %s", buf1.buf,
(printbuf_reset(&buf2),
@@ -814,24 +825,41 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
-out:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
}
-static int bset_key_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- bool updated_range, int rw,
- struct printbuf *err)
+static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ enum bch_validate_flags flags)
+{
+ return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags
+ });
+}
+
+static int bset_key_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range,
+ enum bch_validate_flags flags)
{
- return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
- (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags,
+ };
+ return __bch2_bkey_validate(c, k, from) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
+ (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
}
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
@@ -840,12 +868,26 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
- struct printbuf buf = PRINTBUF;
+ if (!bkeyp_u64s_valid(&b->format, k))
+ return false;
+
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
- printbuf_exit(&buf);
- return ret;
+ return !__bch2_bkey_validate(c, u.s_c,
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_silent
+ });
+}
+
+static inline int btree_node_read_bkey_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
@@ -867,7 +909,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -876,12 +918,20 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format))
goto drop_this_key;
- /* XXX: validate k->u64s */
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, k,
+ btree_node_bkey_bad_u64s,
+ "bad k->u64s %u (min %u max %zu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
+ goto drop_this_key;
+
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@@ -889,26 +939,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
u = __bkey_disassemble(b, k, &tmp);
- printbuf_reset(&buf);
- if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
- printbuf_reset(&buf);
- bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, u.s_c);
-
- btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
- btree_node_bad_bkey,
- "invalid bkey: %s", buf.buf);
+ ret = bset_key_validate(c, b, u.s_c, updated_range, write);
+ if (ret == -BCH_ERR_fsck_delete_bkey)
goto drop_this_key;
- }
+ if (ret)
+ goto fsck_err;
if (write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
- if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
struct bkey up = bkey_unpack_key(b, prev);
printbuf_reset(&buf);
@@ -918,7 +960,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_to_text(&buf, u.k);
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_out_of_order,
"%s", buf.buf))
goto drop_this_key;
@@ -938,13 +980,12 @@ drop_this_key:
* do
*/
- if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
- if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
-
}
/*
@@ -955,7 +996,8 @@ drop_this_key:
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
+ memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_node_need_rewrite(b);
}
fsck_err:
printbuf_exit(&buf);
@@ -974,7 +1016,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
u64 start_time = local_clock();
@@ -988,13 +1031,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (bch2_meta_read_fault("btree"))
btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
@@ -1009,98 +1052,100 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
- "got wrong btree node (want %llx got %llx)\n"
- "got btree %s level %llu pos %s",
- bp->seq, b->data->keys.seq,
- bch2_btree_id_str(BTREE_NODE_ID(b->data)),
- BTREE_NODE_LEVEL(b->data),
- buf.buf);
+ "got wrong btree node: got\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
- "bad btree header: seq 0");
+ "bad btree header: seq 0\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
- struct nonce nonce;
bool first = !b->written;
- bool csum_bad;
- if (!b->written) {
+ if (first) {
+ bne = NULL;
i = &b->data->keys;
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- nonce = btree_nonce(i, b->written << 9);
-
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- csum_bad = bch2_crc_cmp(b->data->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ if (i->seq != b->data->keys.seq)
+ break;
+ }
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i", ret))
- goto fsck_err;
+ struct nonce nonce = btree_nonce(i, b->written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+ btree_err_on(!good_csum_type,
+ bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
+ ? -BCH_ERR_btree_node_read_err_must_retry
+ : -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
} else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
-
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- nonce = btree_nonce(i, b->written << 9);
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i\n", ret))
- goto fsck_err;
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ bool csum_bad = bch2_crc_cmp(bne->csum, csum);
+ if (ca && csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
sectors = vstruct_sectors(bne, c->block_bits);
}
@@ -1128,20 +1173,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(blacklisted && first,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
- b->written += sectors;
+ b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
@@ -1149,12 +1194,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_add(iter,
vstruct_idx(i, 0),
vstruct_last(i));
+
+ max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
}
if (ptr_written) {
btree_err_on(b->written < ptr_written,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
@@ -1167,7 +1214,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(bne->keys.journal_seq),
true),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset");
}
@@ -1178,6 +1225,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset(b, b->set, &b->data->keys);
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1185,6 +1236,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
+ b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
BUG_ON(b->nr.live_u64s != u64s);
@@ -1198,31 +1250,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- printbuf_reset(&buf);
-
- if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
+ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
+ if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
- !bversion_cmp(u.k->version, MAX_VERSION))) {
- printbuf_reset(&buf);
-
- prt_printf(&buf, "invalid bkey: ");
- bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, u.s_c);
-
- btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
- btree_node_bad_bkey,
- "%s", buf.buf);
-
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
+ set_btree_node_need_rewrite(b);
continue;
}
+ if (ret)
+ goto fsck_err;
if (u.k->type == KEY_TYPE_btree_ptr_v2) {
struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
@@ -1239,12 +1281,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
+ rcu_read_lock();
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
- if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
+ rcu_read_unlock();
if (!ptr_written)
set_btree_node_need_rewrite(b);
@@ -1255,10 +1299,12 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else
+ } else {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ }
goto out;
}
@@ -1267,8 +1313,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
struct btree *b = rb->b;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
struct printbuf buf = PRINTBUF;
@@ -1280,8 +1326,8 @@ static void btree_node_read_work(struct work_struct *work)
while (1) {
retry = true;
bch_info(c, "retrying read");
- ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ rb->have_ioref = ca != NULL;
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1295,7 +1341,7 @@ static void btree_node_read_work(struct work_struct *work)
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
@@ -1319,6 +1365,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1327,11 +1374,18 @@ start:
rb->start_time);
bio_put(&rb->bio);
- if (saw_error && !btree_node_read_error(b)) {
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->key.k.p);
- bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
- __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
+ if ((saw_error ||
+ btree_node_need_rewrite(b)) &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+ if (saw_error) {
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
+ __func__, buf.buf);
+ }
bch2_btree_node_rewrite_async(c, b);
}
@@ -1348,12 +1402,12 @@ static void btree_node_read_endio(struct bio *bio)
struct bch_fs *c = rb->c;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
struct btree_node_read_all {
@@ -1440,18 +1494,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
written2 = btree_node_sectors_written(c, ra->buf[i]);
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
@@ -1518,9 +1572,10 @@ fsck_err:
ret = -1;
}
- if (ret)
+ if (ret) {
set_btree_node_read_error(b);
- else if (*saw_error)
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
@@ -1544,7 +1599,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct btree_node_read_all *ra = rb->ra;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1586,14 +1641,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->ra = ra;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->idx = i;
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
@@ -1619,7 +1674,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->io_complete_wq);
+ c->btree_read_complete_wq);
}
return 0;
@@ -1649,20 +1704,21 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err(c, "%s", buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
- if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1674,7 +1730,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
rb->b = b;
rb->ra = NULL;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
@@ -1699,7 +1755,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
}
@@ -1728,16 +1784,16 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
+ /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
+ bch2_trans_unlock(trans);
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- ret = -EIO;
+ ret = -BCH_ERR_btree_node_read_error;
goto err;
}
@@ -1758,15 +1814,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
- unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+ unsigned long old, new;
+ old = READ_ONCE(b->will_make_reachable);
do {
- old = new = v;
+ new = old;
if (!(old & 1))
break;
new &= ~1UL;
- } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+ } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
if (old & 1)
closure_put(&((struct btree_update *) new)->cl);
@@ -1777,14 +1834,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
- unsigned long old, new, v;
+ unsigned long old, new;
unsigned type = 0;
bch2_btree_complete_write(c, b, w);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
@@ -1804,7 +1861,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
new &= ~(1U << BTREE_NODE_write_in_flight);
new &= ~(1U << BTREE_NODE_write_in_flight_inner);
}
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
@@ -1817,10 +1874,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_trans *trans = bch2_trans_get(c);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- __btree_node_write_done(c, b);
- six_unlock_read(&b->c.lock);
+ /* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1829,7 +1887,6 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
- struct bch_extent_ptr *ptr;
int ret = 0;
btree_bounce_free(c,
@@ -1841,7 +1898,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_write_all_failed;
+ ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
@@ -1850,9 +1907,9 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
@@ -1866,8 +1923,8 @@ out:
return;
err:
set_btree_node_noevict(b);
- if (!bch2_err_matches(ret, EROFS))
- bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
goto out;
}
@@ -1879,13 +1936,14 @@ static void btree_node_write_endio(struct bio *bio)
struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
unsigned long flags;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ if (!ca ||
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
@@ -1912,18 +1970,19 @@ static void btree_node_write_endio(struct bio *bio)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- struct printbuf buf = PRINTBUF;
bool saw_error;
- int ret;
- ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
- BKEY_TYPE_btree, WRITE, &buf);
-
- if (ret)
- bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
- printbuf_exit(&buf);
- if (ret)
+ int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level + 1,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_write,
+ });
+ if (ret) {
+ bch2_fs_inconsistent(c, "invalid btree node key before write");
return ret;
+ }
ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
@@ -1952,7 +2011,6 @@ static void btree_write_submit(struct work_struct *work)
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
- struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
@@ -1977,8 +2035,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
* dirty bit requires a write lock, we can't race with other threads
* redirtying it:
*/
+ old = READ_ONCE(b->flags);
do {
- old = new = READ_ONCE(b->flags);
+ new = old;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
@@ -2009,14 +2068,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
- } while (cmpxchg_acquire(&b->flags, old, new) != old);
+ } while (!try_cmpxchg_acquire(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_need_write))
return;
do_write:
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
@@ -2078,11 +2137,11 @@ do_write:
unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
- b->whiteout_u64s = 0;
-
- u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
le16_add_cpu(&i->u64s, u64s);
+ b->whiteout_u64s = 0;
+
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
set_needs_whiteout(i, false);
@@ -2096,7 +2155,7 @@ do_write:
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+ BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
@@ -2123,7 +2182,7 @@ do_write:
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error encrypting btree node: %i\n", ret))
+ "encrypting btree node: %s", bch2_err_str(ret)))
goto err;
nonce = btree_nonce(i, b->written << 9);
@@ -2192,7 +2251,7 @@ do_write:
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->io_complete_wq, &wbio->work);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
@@ -2209,7 +2268,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
bool invalidated_iter = false;
struct btree_node_entry *bne;
- struct bset_tree *t;
if (!btree_node_just_written(b))
return false;
@@ -2232,7 +2290,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* single bset:
*/
if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets, true);
+ btree_node_sort(c, b, 0, b->nsets);
invalidated_iter = true;
} else {
invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
@@ -2287,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
+void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
+ enum six_lock_type lock_type_held,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+
+ if (lock_type_held == SIX_LOCK_intent ||
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
+
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->c.lock)) {
+ bch2_btree_post_write_cleanup(c, b);
+ __bch2_btree_node_unlock_write(trans, b);
+ }
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ } else {
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
+ }
+}
+
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
@@ -2329,20 +2415,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 10);
- prt_tab(out);
- prt_str(out, "nr");
- prt_tab(out);
- prt_str(out, "size");
- prt_newline(out);
+ prt_printf(out, "\tnr\tsize\n");
for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
- prt_printf(out, "%s:", bch2_btree_write_types[i]);
- prt_tab(out);
- prt_u64(out, nr);
- prt_tab(out);
+ prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
prt_newline(out);
}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e251cb6b965f..6f9e4a6dacf7 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -18,19 +18,19 @@ struct btree_node_read_all;
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_inc(&c->btree_cache.dirty);
+ atomic_long_inc(&c->btree_cache.nr_dirty);
}
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
}
-static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ return k.k->type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
: 0;
}
@@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b,
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
@@ -146,11 +144,13 @@ enum btree_write_flags {
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type, unsigned);
+void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
+ enum six_lock_type, unsigned);
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
enum six_lock_type lock_held)
{
- bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+ bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
bool bch2_btree_flush_all_reads(struct bch_fs *);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3ef338df82f5..e32fce4fd258 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l,
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_successor(p);
} else {
p = bpos_nosnap_successor(p);
@@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_predecessor(p);
} else {
p = bpos_nosnap_predecessor(p);
@@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
struct bpos pos = iter->pos;
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
!bkey_eq(pos, POS_MAX))
pos = bkey_successor(iter, pos);
return pos;
@@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
- unsigned i;
-
- EBUG_ON(path->btree_id >= BTREE_ID_NR);
- for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
if (!path->l[i].b) {
BUG_ON(!path->cached &&
bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
@@ -251,15 +248,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- BUG_ON(iter->btree_id >= BTREE_ID_NR);
-
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
- BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
+ (iter->flags & BTREE_ITER_all_snapshots));
- BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
+ (iter->flags & BTREE_ITER_all_snapshots) &&
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
@@ -269,14 +264,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p));
+ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
+ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
+ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p)));
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
@@ -289,7 +286,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
if (!bch2_debug_check_iterators)
return 0;
- if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
if (bkey_err(k) || !k.k)
@@ -300,8 +297,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
k.k->p.snapshot));
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_all_snapshots);
prev = bch2_btree_iter_prev(&copy);
if (!prev.k)
goto out;
@@ -330,8 +327,10 @@ out:
}
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos, bool key_cache)
+ struct bpos pos)
{
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
struct btree_path *path;
struct trans_for_each_path_inorder_iter iter;
struct printbuf buf = PRINTBUF;
@@ -339,19 +338,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
btree_trans_sort_paths(trans);
trans_for_each_path_inorder(trans, path, iter) {
- int cmp = cmp_int(path->btree_id, id) ?:
- cmp_int(path->cached, key_cache);
-
- if (cmp > 0)
- break;
- if (cmp < 0)
- continue;
-
- if (!btree_node_locked(path, 0) ||
+ if (path->btree_id != id ||
+ !btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
- if (!key_cache) {
+ if (!path->cached) {
if (bkey_ge(pos, path->l[0].b->data->min_key) &&
bkey_le(pos, path->l[0].b->key.k.p))
return;
@@ -364,9 +356,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
bch2_dump_trans_paths_updates(trans);
bch2_bpos_to_text(&buf, pos);
- panic("not locked: %s %s%s\n",
- bch2_btree_id_str(id), buf.buf,
- key_cache ? " cached" : "");
+ panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
#else
@@ -709,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
bch2_trans_revalidate_updates_in_node(trans, b);
}
+void bch2_trans_node_drop(struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree_path *path;
+ unsigned i, level = b->c.level;
+
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
/*
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
@@ -732,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+ struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
enum six_lock_type lock_type;
unsigned i;
int ret;
@@ -740,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
EBUG_ON(path->nodes_locked);
while (1) {
- b = READ_ONCE(*rootp);
+ struct btree *b = READ_ONCE(r->b);
+ if (unlikely(!b)) {
+ BUG_ON(!r->error);
+ return r->error;
+ }
+
path->level = READ_ONCE(b->c.level);
if (unlikely(path->level < depth_want)) {
@@ -760,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
ret = btree_node_lock(trans, path, &b->c,
path->level, lock_type, trace_ip);
if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
- continue;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
BUG();
}
- if (likely(b == READ_ONCE(*rootp) &&
+ if (likely(b == READ_ONCE(r->b) &&
b->c.level == path->level &&
!race_fault())) {
for (i = 0; i < path->level; i++)
@@ -837,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
bch2_bkey_buf_init(&tmp);
+ jiter->fail_if_too_many_whiteouts = true;
+
while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
@@ -891,16 +899,29 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
- __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
+ if (!k.k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " at btree ");
+ bch2_btree_pos_to_text(&buf, c, l->b);
+
+ ret = bch2_fs_topology_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
bch2_bkey_buf_reassemble(out, c, k);
- if ((flags & BTREE_ITER_PREFETCH) &&
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
+err:
bch2_btree_and_journal_iter_exit(&jiter);
return ret;
}
@@ -927,10 +948,24 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- if ((flags & BTREE_ITER_PREFETCH) &&
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
@@ -962,7 +997,6 @@ err:
return ret;
}
-
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -986,6 +1020,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
+ trans_set_locked(trans, false);
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1008,9 +1043,9 @@ retry_all:
* the same position:
*/
if (trans->paths[idx].uptodate) {
- __btree_path_get(&trans->paths[idx], false);
+ __btree_path_get(trans, &trans->paths[idx], false);
ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(&trans->paths[idx], false);
+ __btree_path_put(trans, &trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1129,6 +1164,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(!trans->srcu_held))
bch2_trans_srcu_lock(trans);
+ trace_btree_path_traverse_start(trans, path);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@@ -1146,9 +1183,10 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out;
+ goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
+ unsigned max_level = path->level;
EBUG_ON(btree_path_node(path, path->level) &&
!btree_node_locked(path, path->level));
@@ -1180,7 +1218,18 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
}
}
+ if (unlikely(max_level > path->level)) {
+ struct btree_path *linked;
+ unsigned iter;
+
+ trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
+ for (unsigned j = path->level + 1; j < max_level; j++)
+ linked->l[j] = path->l[j];
+ }
+
+out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
+ trace_btree_path_traverse_end(trans, path);
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
panic("ret %s (%i) trans->restarted %s (%i)\n",
@@ -1208,11 +1257,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent)
+ bool intent, unsigned long ip)
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans->paths + new, intent);
+ __btree_path_get(trans, trans->paths + new, intent);
+#ifdef TRACK_PATH_ALLOCATED
+ trans->paths[new].ip_allocated = ip;
+#endif
return new;
}
@@ -1220,8 +1272,10 @@ __flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent);
+ struct btree_path *old = trans->paths + path;
+ __btree_path_put(trans, trans->paths + path, intent);
+ path = btree_path_clone(trans, path, intent, ip);
+ trace_btree_path_clone(trans, old, trans->paths + path);
trans->paths[path].preserve = false;
return path;
}
@@ -1233,9 +1287,11 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
{
int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
+ trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
+
path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
struct btree_path *path = trans->paths + path_idx;
@@ -1321,24 +1377,51 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t
__clear_bit(path, trans->paths_allocated);
}
+static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned l = path->level;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!is_btree_node(path, l))
+ return false;
+
+ if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
+ return false;
+
+ l++;
+ } while (l < path->locks_want);
+
+ return true;
+}
+
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
struct btree_path *path = trans->paths + path_idx, *dup;
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans, path, intent))
return;
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
+ trace_btree_path_free(trans, path_idx, dup);
+
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
- if (path->should_be_locked &&
- !trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
- return;
+ if (path->should_be_locked && !trans->restarted) {
+ if (!dup)
+ return;
+
+ if (!(trans->locked
+ ? bch2_btree_path_relock_norestart(trans, dup)
+ : bch2_btree_path_can_relock(trans, dup)))
+ return;
+ }
if (dup) {
dup->preserve |= path->preserve;
@@ -1351,7 +1434,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- if (!__btree_path_put(trans->paths + path, intent))
+ if (!__btree_path_put(trans, trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1364,29 +1447,48 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_
(void *) trans->last_begin_ip);
}
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+ bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
+ panic("in transaction restart: %s, last restarted by\n%s",
+ bch2_err_str(trans->restarted),
+ buf.buf);
+#else
panic("in transaction restart: %s, last restarted by %pS\n",
bch2_err_str(trans->restarted),
(void *) trans->last_restarted_ip);
+#endif
+}
+
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
+{
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
+ if (!trans->locked)
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+
+ BUG();
}
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- prt_printf(buf, "transaction updates for %s journal seq %llu",
- trans->fn, trans->journal_res.seq);
- prt_newline(buf);
+ prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
+ trans->nr_updates, trans->fn, trans->journal_res.seq);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- prt_printf(buf, "update: btree=%s cached=%u %pS",
- bch2_btree_id_str(i->btree_id),
- i->cached,
- (void *) i->ip_allocated);
- prt_newline(buf);
+ prt_str(buf, "update: btree=");
+ bch2_btree_id_to_text(buf, i->btree_id);
+ prt_printf(buf, " cached=%u %pS\n",
+ i->cached,
+ (void *) i->ip_allocated);
prt_printf(buf, " old ");
bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1411,27 +1513,75 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
struct printbuf buf = PRINTBUF;
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}
-static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- bch2_btree_id_str(path->btree_id),
- path->level);
+ path->cached ? 'C' : 'B');
+ bch2_btree_id_level_to_text(out, path->btree_id, path->level);
+ prt_str(out, " pos ");
bch2_bpos_to_text(out, path->pos);
- prt_printf(out, " locks %u", path->nodes_locked);
+ if (!path->cached && btree_node_locked(path, path->level)) {
+ prt_char(out, ' ');
+ struct btree *b = path_l(path)->b;
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_char(out, '-');
+ bch2_bpos_to_text(out, b->key.k.p);
+ }
+
#ifdef TRACK_PATH_ALLOCATED
prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
+}
+
+static const char *btree_node_locked_str(enum btree_node_locked_type t)
+{
+ switch (t) {
+ case BTREE_NODE_UNLOCKED:
+ return "unlocked";
+ case BTREE_NODE_READ_LOCKED:
+ return "read";
+ case BTREE_NODE_INTENT_LOCKED:
+ return "intent";
+ case BTREE_NODE_WRITE_LOCKED:
+ return "write";
+ default:
+ return NULL;
+ }
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+{
+ bch2_btree_path_to_text_short(out, trans, path_idx);
+
+ struct btree_path *path = trans->paths + path_idx;
+
+ prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ prt_printf(out, "l=%u locks %s seq %u node ", l,
+ btree_node_locked_str(btree_node_locked_type(path, l)),
+ path->l[l].lock_seq);
+
+ int ret = PTR_ERR_OR_ZERO(path->l[l].b);
+ if (ret)
+ prt_str(out, bch2_err_str(ret));
+ else
+ prt_printf(out, "%px", path->l[l].b);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
}
static noinline __cold
@@ -1443,8 +1593,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_idx_inorder(trans, iter)
- bch2_btree_path_to_text(out, trans, iter.path_idx);
+ trans_for_each_path_idx_inorder(trans, iter) {
+ bch2_btree_path_to_text_short(out, trans, iter.path_idx);
+ prt_newline(out);
+ }
}
noinline __cold
@@ -1461,7 +1613,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}
@@ -1520,7 +1672,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
- void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
nr * sizeof(btree_path_idx_t) + 8 +
@@ -1595,12 +1747,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
unsigned flags, unsigned long ip)
{
struct btree_path *path;
- bool cached = flags & BTREE_ITER_CACHED;
- bool intent = flags & BTREE_ITER_INTENT;
+ bool cached = flags & BTREE_ITER_cached;
+ bool intent = flags & BTREE_ITER_intent;
struct trans_for_each_path_inorder_iter iter;
btree_path_idx_t path_pos = 0, path_idx;
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
@@ -1620,14 +1772,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
trans->paths[path_pos].level == level) {
- __btree_path_get(trans->paths + path_pos, intent);
+ trace_btree_path_get(trans, trans->paths + path_pos, &pos);
+
+ __btree_path_get(trans, trans->paths + path_pos, intent);
path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
path = trans->paths + path_idx;
} else {
path_idx = btree_path_alloc(trans, path_pos);
path = trans->paths + path_idx;
- __btree_path_get(path, intent);
+ __btree_path_get(trans, path, intent);
path->pos = pos;
path->btree_id = btree_id;
path->cached = cached;
@@ -1642,9 +1796,11 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
path->ip_allocated = ip;
#endif
trans->paths_sorted = false;
+
+ trace_btree_path_alloc(trans, path);
}
- if (!(flags & BTREE_ITER_NOPRESERVE))
+ if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
if (path->intent_ref)
@@ -1665,6 +1821,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
return path_idx;
}
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_intent, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path_idx;
+}
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
@@ -1688,15 +1860,14 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
goto hole;
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
-
- EBUG_ON(ck &&
- (path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos)));
- if (!ck || !ck->valid)
+ if (!ck)
return bkey_s_c_null;
+ EBUG_ON(path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos));
+
*u = ck->k->k;
- k = bkey_i_to_s_c(ck->k);
+ k = (struct bkey_s_c) { u, &ck->k->v };
}
return k;
@@ -1706,6 +1877,18 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
+void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
+}
/* Btree iterators: */
int __must_check
@@ -1720,16 +1903,20 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
int ret;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
if (ret)
return ret;
- btree_path_set_should_be_locked(trans->paths + iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (btree_path_node(path, path->level))
+ btree_path_set_should_be_locked(trans, path);
return 0;
}
@@ -1759,9 +1946,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1772,6 +1959,7 @@ err:
goto out;
}
+/* Only kept for -tools */
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
{
struct btree *b;
@@ -1790,9 +1978,14 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (ret)
+ goto err;
+
+
struct btree_path *path = btree_iter_path(trans, iter);
/* already at end? */
@@ -1820,13 +2013,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (bpos_eq(iter->pos, b->key.k.p)) {
__btree_path_set_level_up(trans, path, path->level++);
} else {
+ if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, path->level + 1);
+
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
iter->path = bch2_btree_path_set_pos(trans, iter->path,
bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
path = btree_iter_path(trans, iter);
@@ -1844,9 +2040,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
@@ -1863,11 +2059,11 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, SPOS_MAX)
: bkey_eq(pos, SPOS_MAX));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -1876,11 +2072,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, POS_MIN)
: bkey_eq(pos, POS_MIN));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -1938,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
{
struct btree_path *path = btree_iter_path(trans, iter);
- return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
path->level,
path->pos,
end_pos,
@@ -1961,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
}
static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+void btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(path)->b->key.k.p);
-
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
- k = bkey_i_to_s_c(next_journal);
+ *k = bkey_i_to_s_c(next_journal);
}
+}
- return k;
+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
+}
+
+static noinline
+void btree_trans_peek_prev_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek_prev(trans, iter,
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ *k = bkey_i_to_s_c(next_journal);
+ }
}
/*
@@ -1991,7 +2213,9 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
struct bkey_s_c k;
int ret;
- if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ if ((iter->flags & BTREE_ITER_key_cache_fill) &&
bpos_eq(iter->pos, pos))
return bkey_s_c_null;
@@ -2000,28 +2224,32 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (!iter->key_cache_path)
iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_INTENT, 0,
- iter->flags|BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL,
+ iter->flags & BTREE_ITER_intent, 0,
+ iter->flags|BTREE_ITER_cached|
+ BTREE_ITER_cached_nofill,
_THIS_IP_);
iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_CACHED) ?:
+ iter->flags|BTREE_ITER_cached) ?:
bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
-
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (k.k && !bkey_err(k)) {
- iter->k = u;
- k.k = &iter->k;
- }
+ if (!k.k)
+ return k;
+
+ if ((iter->flags & BTREE_ITER_all_snapshots) &&
+ !bpos_eq(pos, k.k->p))
+ return bkey_s_c_null;
+
+ iter->k = u;
+ k.k = &iter->k;
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
return k;
}
@@ -2035,10 +2263,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
bch2_btree_iter_verify(iter);
while (1) {
- struct btree_path_level *l;
-
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2046,38 +2272,37 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
- l = path_l(path);
+ struct btree_path_level *l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
- ret = bkey_err(k);
- if (ret) {
+ if (bkey_err(k)) {
bch2_btree_iter_set_pos(iter, iter->pos);
- goto out;
+ break;
}
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
- k = btree_trans_peek_journal(trans, iter, k);
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ btree_trans_peek_journal(trans, iter, &k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_updates(trans, iter, &k);
@@ -2104,41 +2329,46 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
}
-out:
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
/**
- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
- struct bpos iter_pos;
+ struct bpos iter_pos = iter->pos;
int ret;
- EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
- bch2_btree_iter_verify_entry_exit(iter);
-
while (1) {
k = __bch2_btree_iter_peek(iter, search_key);
if (unlikely(!k.k))
@@ -2146,75 +2376,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
if (unlikely(bkey_err(k)))
goto out_no_locked;
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode that requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
+ }
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
- iter->update_path = 0;
- }
+ if ((iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_INTENT) &&
- !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
+ pos.snapshot = iter->snapshot;
- pos.snapshot = iter->snapshot;
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
/*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
*/
- __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_INTENT,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
+ if (!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
}
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- !bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
}
/*
@@ -2222,14 +2452,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* equal to the key we just returned - except extents can
* straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (!(iter->flags & BTREE_ITER_is_extents))
iter_pos = k.k->p;
else
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_gt(iter_pos, end)
- : bkey_ge(iter_pos, end)))
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
+ bkey_gt(iter_pos, end)))
goto end;
break;
@@ -2238,20 +2468,20 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->pos = iter_pos;
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(trans->paths + iter->update_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
}
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
ret = bch2_btree_iter_verify_ret(iter, k);
@@ -2284,135 +2514,224 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return bch2_btree_iter_peek(iter);
}
-/**
- * bch2_btree_iter_peek_prev() - returns first key less than or equal to
- * iterator's current position
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bpos search_key = iter->pos;
- struct bkey_s_c k;
- struct bkey saved_k;
- const struct bch_val *saved_v;
- btree_path_idx_t saved_path = 0;
- int ret;
-
- EBUG_ON(btree_iter_path(trans, iter)->cached ||
- btree_iter_path(trans, iter)->level);
-
- if (iter->flags & BTREE_ITER_WITH_JOURNAL)
- return bkey_s_c_err(-EIO);
+ struct bkey_s_c k, k2;
bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
-
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
- search_key.snapshot = U32_MAX;
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree_path_level *l = path_l(path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ break;
+ }
+
+ btree_path_set_should_be_locked(trans, path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+ if (!k.k || bpos_gt(k.k->p, search_key)) {
+ k = btree_path_level_prev(trans, path, l, &iter->k);
+
+ BUG_ON(k.k && bpos_gt(k.k->p, search_key));
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ if (bkey_err(k2)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ break;
+ }
+ }
- k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
- if (!k.k ||
- ((iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bpos_ge(bkey_start_pos(k.k), search_key)
- : bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ btree_trans_peek_prev_journal(trans, iter, &k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
- if (likely(k.k)) {
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
- if (k.k->p.snapshot == iter->snapshot)
- goto got_key;
+ if (likely(k.k && !bkey_deleted(k.k))) {
+ break;
+ } else if (k.k) {
+ search_key = bpos_predecessor(k.k->p);
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
+ k = bkey_s_c_null;
+ break;
+ }
+ }
+
+ bch2_btree_iter_verify(iter);
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys greater than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+{
+ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
+ !bkey_eq(iter->pos, POS_MAX)) {
+ /*
+ * bkey_start_pos(), for extents, is not monotonically
+ * increasing until after filtering for snapshots:
+ *
+ * Thus, for extents we need to search forward until we find a
+ * real visible extents - easiest to just use peek_slot() (which
+ * internally uses peek() for extents)
+ */
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k))
+ return k;
+
+ if (!bkey_deleted(k.k) &&
+ (!(iter->flags & BTREE_ITER_is_extents) ||
+ bkey_lt(bkey_start_pos(k.k), iter->pos)))
+ return k;
+ }
+
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct bkey_s_c k;
+ btree_path_idx_t saved_path = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
+
+ int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ while (1) {
+ k = __bch2_btree_iter_peek_prev(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
+ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
+ /*
+ * If we have a saved candidate, and we're past
+ * the last possible snapshot overwrite, return
+ * it:
+ */
+ bch2_path_put_nokeep(trans, iter->path,
+ iter->flags & BTREE_ITER_intent);
+ iter->path = saved_path;
+ saved_path = 0;
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
+ break;
+ }
+
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ */
+ if (unlikely(bkey_lt(k.k->p, end)))
+ goto end;
+
+ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+ if (k.k->p.snapshot != iter->snapshot) {
/*
- * If we have a saved candidate, and we're no
- * longer at the same _key_ (not pos), return
- * that candidate
+ * Have a key visible in iter->snapshot, but
+ * might have overwrites: - save it and keep
+ * searching. Unless it's a whiteout - then drop
+ * our previous saved candidate:
*/
- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
- bch2_path_put_nokeep(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
- iter->path = saved_path;
+ if (saved_path) {
+ bch2_path_put_nokeep(trans, saved_path,
+ iter->flags & BTREE_ITER_intent);
saved_path = 0;
- iter->k = saved_k;
- k.v = saved_v;
- goto got_key;
}
- if (bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- if (saved_path)
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_INTENT);
+ if (!bkey_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
- path = btree_iter_path(trans, iter);
- saved_k = *k.k;
- saved_v = k.v;
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ trace_btree_path_save_pos(trans,
+ trans->paths + iter->path,
+ trans->paths + saved_path);
}
search_key = bpos_predecessor(k.k->p);
continue;
}
-got_key:
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+
+ if (bkey_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
- search_key.snapshot = U32_MAX;
+ search_key.snapshot = U32_MAX;
continue;
}
-
- btree_path_set_should_be_locked(path);
- break;
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
- k = bkey_s_c_null;
- goto out_no_locked;
}
- }
- EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
+ bkey_gt(k.k->p, iter->pos));
+
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
+ bkey_lt(k.k->p, end)))
+ goto end;
+
+ break;
+ }
/* Extents can straddle iter->pos: */
- if (bkey_lt(k.k->p, iter->pos))
- iter->pos = k.k->p;
+ iter->pos = bpos_min(iter->pos, k.k->p);;
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
-
return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
}
/**
@@ -2437,12 +2756,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c k;
int ret;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
+
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
/* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
@@ -2452,7 +2778,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
search_key = btree_iter_search_key(iter);
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2461,22 +2787,26 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- if ((iter->flags & BTREE_ITER_CACHED) ||
- !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (unlikely(!btree_path_node(path, path->level)))
+ return bkey_s_c_null;
+
+ if ((iter->flags & BTREE_ITER_cached) ||
+ !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates)) {
bch2_btree_trans_peek_slot_updates(trans, iter, &k);
if (k.k)
goto out;
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
(k = btree_trans_peek_slot_journal(trans, iter)).k)
goto out;
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
(k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
@@ -2487,22 +2817,28 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
+
+ if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)))
+ iter->k.type = KEY_TYPE_deleted;
} else {
struct bpos next;
struct bpos end = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
end.offset = U64_MAX;
EBUG_ON(btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_INTENT) {
+ if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_upto(&iter2, end);
+ k = bch2_btree_iter_peek_max(&iter2, end);
if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
iter->k = iter2.k;
k.k = &iter->k;
}
@@ -2510,7 +2846,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_upto(iter, end);
+ k = bch2_btree_iter_peek_max(iter, end);
if (unlikely(bkey_err(k)))
bch2_btree_iter_set_pos(iter, pos);
else
@@ -2526,7 +2862,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (iter->flags & BTREE_ITER_is_extents) {
bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
(next.inode == iter->pos.inode
@@ -2540,7 +2876,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2567,6 +2903,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
+/* Obsolete, but still used by rust wrapper in -tools */
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
struct bkey_s_c k;
@@ -2710,13 +3047,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
@@ -2729,7 +3066,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_RET_IP_);
}
@@ -2741,12 +3078,15 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
unsigned depth,
unsigned flags)
{
- flags |= BTREE_ITER_NOT_EXTENTS;
- flags |= __BTREE_ITER_ALL_SNAPSHOTS;
- flags |= BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_not_extents;
+ flags |= BTREE_ITER_snapshot_field;
+ flags |= BTREE_ITER_all_snapshots;
+
+ if (!depth && btree_id_cached(trans->c, btree_id))
+ flags |= BTREE_ITER_with_key_cache;
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- __bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, depth, flags),
_RET_IP_);
iter->min_depth = depth;
@@ -2762,10 +3102,13 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
struct btree_trans *trans = src->trans;
*dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+ dst->ip_allocated = _RET_IP_;
+#endif
if (src->path)
- __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
- __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@@ -2781,9 +3124,38 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (ret)
+ return ERR_PTR(ret);
+
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+ /* No more space from mempool item, need malloc new one */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
@@ -2792,6 +3164,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
kfree(trans->mem);
}
@@ -2805,15 +3179,16 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-
+out_new_mem:
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
if (old_bytes) {
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+ return ERR_PTR(btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
-
+out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -2907,7 +3282,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
if (!trans->restarted &&
(need_resched() ||
time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- drop_locks_do(trans, (cond_resched(), 0));
+ bch2_trans_unlock(trans);
+ cond_resched();
now = local_clock();
}
trans->last_begin_time = now;
@@ -2917,11 +3293,23 @@ u32 bch2_trans_begin(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
+
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (trans->restarted) {
+ trans->restart_count_this_trans++;
+ } else {
+ trans->restart_count_this_trans = 0;
+ }
+#endif
+
+ trans_set_locked(trans, false);
+
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
trans->notrace_relock_fail = false;
}
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
return trans->restart_count;
}
@@ -2955,7 +3343,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
memset(trans, 0, sizeof(*trans));
- closure_init_stack(&trans->ref);
seqmutex_lock(&c->btree_trans_lock);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -2974,16 +3361,11 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
*/
BUG_ON(pos_task &&
pid == pos_task->pid &&
- bch2_trans_locked(pos));
-
- if (pos_task && pid < pos_task->pid) {
- list_add_tail(&trans->list, &pos->list);
- goto list_add_done;
- }
+ pos->locked);
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
+
+ list_add(&trans->list, &c->btree_trans_list);
seqmutex_unlock(&c->btree_trans_lock);
got_trans:
trans->c = c;
@@ -2991,7 +3373,7 @@ got_trans:
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
trans->nr_paths = ARRAY_SIZE(trans->_paths);
trans->paths_allocated = trans->_paths_allocated;
@@ -3003,6 +3385,9 @@ got_trans:
trans->paths_allocated[0] = 1;
+ static struct lock_class_key lockdep_key;
+ lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
+
if (fn_idx < BCH_TRANSACTIONS_NR) {
trans->fn = bch2_btree_transaction_fns[fn_idx];
@@ -3023,6 +3408,9 @@ got_trans:
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
+ trans_set_locked(trans, false);
+
+ closure_init_stack_release(&trans->ref);
return trans;
}
@@ -3054,12 +3442,14 @@ void bch2_trans_put(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
- __btree_path_put(trans->paths + i->path, true);
+ __btree_path_put(trans, trans->paths + i->path, true);
trans->nr_updates = 0;
- trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3068,26 +3458,28 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- if (trans->fs_usage_deltas) {
- if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
- REPLICAS_DELTA_LIST_MAX)
- mempool_free(trans->fs_usage_deltas,
- &c->replicas_delta_pool);
- else
- kfree(trans->fs_usage_deltas);
- }
-
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ /*
+ * trans->ref protects trans->locking_wait.task, btree_paths array; used
+ * by cycle detector
+ */
+ closure_return_sync(&trans->ref);
+ trans->locking_wait.task = NULL;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+#endif
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
if (paths_allocated != trans->_paths_allocated)
- kfree_rcu_mightsleep(paths_allocated);
+ kvfree_rcu_mightsleep(paths_allocated);
- if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ if (trans->used_mempool)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
@@ -3097,8 +3489,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3107,6 +3497,21 @@ void bch2_trans_put(struct btree_trans *trans)
}
}
+bool bch2_current_has_btree_trans(struct bch_fs *c)
+{
+ seqmutex_lock(&c->btree_trans_lock);
+ struct btree_trans *trans;
+ bool ret = false;
+ list_for_each_entry(trans, &c->btree_trans_list, list)
+ if (trans->locking_wait.task == current &&
+ trans->locked) {
+ ret = true;
+ break;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+ return ret;
+}
+
static void __maybe_unused
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
@@ -3120,13 +3525,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
pid = owner ? owner->pid : 0;
rcu_read_unlock();
- prt_tab(out);
- prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_id_str(b->btree_id));
+ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, b->btree_id);
+ prt_printf(out, " l=%u:", b->level);
bch2_bpos_to_text(out, btree_node_pos(b));
- prt_tab(out);
- prt_printf(out, " locks %u:%u:%u held by pid %u",
+ prt_printf(out, "\t locks %u:%u:%u held by pid %u",
c.n[0], c.n[1], c.n[2], pid);
}
@@ -3162,11 +3566,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
if (!path->nodes_locked)
continue;
- prt_printf(out, " path %u %c l=%u %s:",
- idx,
- path->cached ? 'c' : 'b',
- path->level,
- bch2_btree_id_str(path->btree_id));
+ prt_printf(out, " path %u %c ",
+ idx,
+ path->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, path->btree_id);
+ prt_printf(out, " l=%u:", path->level);
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
@@ -3183,10 +3587,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
b = READ_ONCE(trans->locking);
if (b) {
- prt_printf(out, " blocked for %lluus on",
- div_u64(local_clock() - trans->locking_wait.start_time,
- 1000));
- prt_newline(out);
+ prt_printf(out, " blocked for %lluus on\n",
+ div_u64(local_clock() - trans->locking_wait.start_time, 1000));
prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
@@ -3208,8 +3610,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3229,8 +3629,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
bch2_time_stats_exit(&s->lock_hold_times);
}
- if (c->btree_trans_barrier_initialized)
+ if (c->btree_trans_barrier_initialized) {
+ synchronize_srcu_expedited(&c->btree_trans_barrier);
cleanup_srcu_struct(&c->btree_trans_barrier);
+ }
mempool_exit(&c->btree_trans_mem_pool);
mempool_exit(&c->btree_trans_pool);
}
@@ -3264,7 +3666,22 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
BTREE_TRANS_MEM_MAX) ?:
init_srcu_struct(&c->btree_trans_barrier);
- if (!ret)
- c->btree_trans_barrier_initialized = true;
- return ret;
+ if (ret)
+ return ret;
+
+ /*
+ * static annotation (hackily done) for lock ordering of reclaim vs.
+ * btree node locks:
+ */
+#ifdef CONFIG_LOCKDEP
+ fs_reclaim_acquire(GFP_KERNEL);
+ struct btree_trans *trans = bch2_trans_get(c);
+ trans_set_locked(trans, false);
+ bch2_trans_put(trans);
+ fs_reclaim_release(GFP_KERNEL);
+#endif
+
+ c->btree_trans_barrier_initialized = true;
+ return 0;
+
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc..b96157f3dc9c 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -6,6 +6,12 @@
#include "btree_types.h"
#include "trace.h"
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
static inline int __bkey_err(const struct bkey *k)
{
return PTR_ERR_OR_ZERO(k);
@@ -13,16 +19,30 @@ static inline int __bkey_err(const struct bkey *k)
#define bkey_err(_k) __bkey_err((_k).k)
-static inline void __btree_path_get(struct btree_path *path, bool intent)
+static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ unsigned idx = path - trans->paths;
+
+ EBUG_ON(idx >= trans->nr_paths);
+ EBUG_ON(!test_bit(idx, trans->paths_allocated));
+ if (unlikely(path->ref == U8_MAX)) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("path %u refcount overflow\n", idx);
+ }
+
path->ref++;
path->intent_ref += intent;
+ trace_btree_path_get_ll(trans, path);
}
-static inline bool __btree_path_put(struct btree_path *path, bool intent)
+static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ EBUG_ON(path - trans->paths >= trans->nr_paths);
+ EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
EBUG_ON(!path->ref);
EBUG_ON(!path->intent_ref && intent);
+
+ trace_btree_path_put_ll(trans, path);
path->intent_ref -= intent;
return --path->ref == 0;
}
@@ -216,9 +236,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
btree_path_idx_t,
unsigned, unsigned long);
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
+
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_idx_t path, unsigned flags)
{
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
@@ -227,6 +251,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran
btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
+ unsigned, struct bpos);
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
/*
@@ -261,12 +288,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
- struct bpos, bool);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos, bool key_cache) {}
+ struct bpos pos) {}
#endif
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
@@ -283,7 +309,6 @@ int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
void bch2_trans_unlock_long(struct btree_trans *);
-bool bch2_trans_locked(struct btree_trans *);
static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
@@ -301,30 +326,45 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
bch2_trans_restart_error(trans, restart_count);
}
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
{
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
+ if (trans->restarted || !trans->locked)
+ bch2_trans_unlocked_or_in_restart_error(trans);
}
__always_inline
-static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
trans->restarted = err;
- trans->last_restarted_ip = _THIS_IP_;
+ trans->last_restarted_ip = ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+ bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
+#endif
return -err;
}
__always_inline
static int btree_trans_restart(struct btree_trans *trans, int err)
{
- btree_trans_restart_nounlock(trans, err);
- return -err;
+ return btree_trans_restart_ip(trans, err, _THIS_IP_);
+}
+
+static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
+ trace_and_count(trans->c, trans_restart_injected, trans, ip);
+ return btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_fault_inject, ip);
+ }
+#endif
+ return 0;
}
bool bch2_btree_node_upgrade(struct btree_trans *,
@@ -344,6 +384,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
+void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -353,15 +394,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
- return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+ return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
}
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
@@ -386,10 +433,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
new_pos.snapshot = iter->snapshot;
__bch2_btree_iter_set_pos(iter, new_pos);
@@ -397,7 +444,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
- BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
iter->pos = bkey_start_pos(&iter->k);
}
@@ -412,41 +459,35 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned level,
+ unsigned flags)
{
- if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ if (level || !btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
+
+ if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_IS_EXTENTS;
+ flags |= BTREE_ITER_is_extents;
- if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_snapshot_field) &&
!btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ flags &= ~BTREE_ITER_all_snapshots;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_all_snapshots) &&
btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ flags |= BTREE_ITER_filter_snapshots;
if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_WITH_JOURNAL;
+ flags |= BTREE_ITER_with_journal;
return flags;
}
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
-{
- if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_CACHED;
- flags &= ~BTREE_ITER_WITH_KEY_CACHE;
- } else if (!(flags & BTREE_ITER_CACHED))
- flags |= BTREE_ITER_WITH_KEY_CACHE;
-
- return __bch2_btree_iter_flags(trans, btree_id, flags);
-}
-
static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
@@ -483,7 +524,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
if (__builtin_constant_p(btree_id) &&
__builtin_constant_p(flags))
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_THIS_IP_);
else
bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
@@ -494,16 +535,16 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
unsigned, unsigned, unsigned);
void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-static inline void set_btree_iter_dontneed(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
-
- if (!trans->restarted)
- btree_iter_path(trans, iter)->preserve = false;
-}
+void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size = roundup(size, 8);
@@ -562,23 +603,30 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type))
+static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
+{
+ unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
+ memcpy(dst_v, src_k.v, b);
+ if (unlikely(b < dst_size))
+ memset(dst_v + b, 0, dst_size - b);
+}
+
+#define bkey_val_copy(_dst_v, _src_k) \
+do { \
+ BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
+ __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
+} while (0)
+
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, unsigned type,
unsigned val_size, void *val)
{
struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
- ret = bkey_err(k);
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+ int ret = bkey_err(k);
if (!ret) {
- unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
-
- memcpy(val, k.v, b);
- if (unlikely(b < sizeof(*val)))
- memset((void *) val + b, 0, sizeof(*val) - b);
+ __bkey_val_copy(val, val_size, k);
bch2_trans_iter_exit(trans, &iter);
}
@@ -593,44 +641,56 @@ void bch2_trans_srcu_unlock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
-/*
- * XXX
- * this does not handle transaction restarts from bch2_btree_iter_next_node()
- * correctly
- */
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b, _ret) \
- for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
- _start, _locks_want, _depth, _flags); \
- (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \
- !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
- (_b) = bch2_btree_iter_next_node(&(_iter)))
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _do) \
+({ \
+ bch2_trans_begin((_trans)); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ int _ret3 = 0; \
+ do { \
+ _ret3 = lockrestart_do((_trans), ({ \
+ struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ if (!_b) \
+ break; \
+ \
+ PTR_ERR_OR_ZERO(_b) ?: (_do); \
+ })) ?: \
+ lockrestart_do((_trans), \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ } while (!_ret3); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b, _ret) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b, _ret)
+ _flags, _b, _do) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _do)
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
struct bpos end,
unsigned flags)
{
- if (!(flags & BTREE_ITER_SLOTS))
- return bch2_btree_iter_peek_upto(iter, end);
+ if (!(flags & BTREE_ITER_slots))
+ return bch2_btree_iter_peek_max(iter, end);
if (bkey_gt(iter->pos, end))
return bkey_s_c_null;
@@ -642,7 +702,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
@@ -691,22 +751,18 @@ transaction_restart: \
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
- _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+ _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
})
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
+#define for_each_btree_key_max_continue(_trans, _iter, \
+ _end, _flags, _k, _do) \
({ \
- struct btree_iter _iter; \
struct bkey_s_c _k; \
int _ret3 = 0; \
\
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
_end, (_flags)); \
if (!(_k).k) \
break; \
@@ -719,9 +775,24 @@ transaction_restart: \
_ret3; \
})
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
+ for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
+
+#define for_each_btree_key_max(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ bch2_trans_begin(trans); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
+})
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
SPOS_MAX, _flags, _k, _do)
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
@@ -765,59 +836,45 @@ transaction_restart: \
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
+#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
_start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-#define for_each_btree_key_old(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
-#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
for (; \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
- for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
+ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(&(_iter)))
+
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
@@ -827,7 +884,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
- _do ?: bch2_trans_relock(_trans); \
+ (_do) ?: bch2_trans_relock(_trans); \
})
#define allocate_dropping_locks_errcode(_trans, _do) \
@@ -837,7 +894,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, _do); \
+ _ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
@@ -850,19 +907,26 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, ((_p = _do), 0)); \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
+#define bch2_trans_run(_c, _do) \
+({ \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
+ _ret; \
+})
+
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
+bool bch2_current_has_btree_trans(struct bch_fs *);
+
extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
unsigned bch2_trans_get_fn_idx(const char *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 719a94a84950..6d25e3f85ce8 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bset.h"
+#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@@ -14,19 +16,15 @@
* operations for the regular btree iter code to use:
*/
-static int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
+static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
{
- return (cmp_int(l_btree_id, r->btree_id) ?:
- cmp_int(l_level, r->level) ?:
- bpos_cmp(l_pos, r->k->k.p));
-}
+ size_t gap_size = keys->size - keys->nr;
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+ BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
+
+ if (pos >= keys->gap)
+ pos -= gap_size;
+ return pos;
}
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
@@ -40,7 +38,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
- return keys->d + idx_to_pos(keys, idx);
+ return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
@@ -74,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
}
/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
{
@@ -97,27 +95,92 @@ search:
}
}
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- return NULL;
+ break;
if (k->overwritten) {
- (*idx)++;
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->end;
+ else
+ *idx += 1;
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
- return k->k;
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
+ ret = k->k;
+ break;
+ }
(*idx)++;
iters++;
if (iters == 10) {
*idx = 0;
+ rcu_read_unlock();
goto search;
}
}
- return NULL;
+ rcu_read_unlock();
+ return ret;
+}
+
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ break;
+
+ if (k->overwritten) {
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->start - 1;
+ else
+ *idx -= 1;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
+ ret = k->k;
+ break;
+ }
+
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
}
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
@@ -125,7 +188,25 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
{
size_t idx = 0;
- return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+ return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iter_verify(struct journal_iter *iter)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ BUG_ON(cmp > 0);
+ }
+#endif
}
static void journal_iters_fix(struct bch_fs *c)
@@ -133,7 +214,8 @@ static void journal_iters_fix(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -141,9 +223,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -172,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
* Ensure these keys are done last by journal replay, to unblock
* journal reclaim:
*/
- .journal_seq = U32_MAX,
+ .journal_seq = U64_MAX,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -180,33 +267,38 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
+ journal_key_cmp(&n, &keys->data[idx]) == 0) {
+ if (keys->data[idx].allocated)
+ kfree(keys->data[idx].k);
+ keys->data[idx] = n;
return 0;
}
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
- if (!new_keys.d) {
+ new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
- memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
- kvfree(keys->d);
- keys->d = new_keys.d;
+ memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+ kvfree(keys->data);
+ keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
@@ -214,13 +306,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
- move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
- keys->gap = idx;
+ move_gap(keys, idx);
keys->nr++;
- keys->d[keys->gap++] = n;
+ keys->data[keys->gap++] = n;
journal_iters_fix(c);
@@ -260,6 +351,84 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
+static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
+{
+ struct journal_key *k = keys->data + pos;
+ size_t idx = pos_to_idx(keys, pos);
+
+ k->overwritten = true;
+
+ struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
+ struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
+
+ bool prev_overwritten = prev && prev->overwritten;
+ bool next_overwritten = next && next->overwritten;
+
+ struct journal_key_range_overwritten *prev_range =
+ prev_overwritten ? prev->overwritten_range : NULL;
+ struct journal_key_range_overwritten *next_range =
+ next_overwritten ? next->overwritten_range : NULL;
+
+ BUG_ON(prev_range && prev_range->end != idx);
+ BUG_ON(next_range && next_range->start != idx + 1);
+
+ if (prev_range && next_range) {
+ prev_range->end = next_range->end;
+
+ keys->data[pos].overwritten_range = prev_range;
+ for (size_t i = next_range->start; i < next_range->end; i++) {
+ struct journal_key *ip = keys->data + idx_to_pos(keys, i);
+ BUG_ON(ip->overwritten_range != next_range);
+ ip->overwritten_range = prev_range;
+ }
+
+ kfree_rcu_mightsleep(next_range);
+ } else if (prev_range) {
+ prev_range->end++;
+ k->overwritten_range = prev_range;
+ if (next_overwritten) {
+ prev_range->end++;
+ next->overwritten_range = prev_range;
+ }
+ } else if (next_range) {
+ next_range->start--;
+ k->overwritten_range = next_range;
+ if (prev_overwritten) {
+ next_range->start--;
+ prev->overwritten_range = next_range;
+ }
+ } else if (prev_overwritten || next_overwritten) {
+ struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return;
+
+ r->start = idx - (size_t) prev_overwritten;
+ r->end = idx + 1 + (size_t) next_overwritten;
+
+ rcu_assign_pointer(k->overwritten_range, r);
+ if (prev_overwritten)
+ prev->overwritten_range = r;
+ if (next_overwritten)
+ next->overwritten_range = r;
+ }
+}
+
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -267,10 +436,14 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
- keys->d[idx].btree_id == btree &&
- keys->d[idx].level == level &&
- bpos_eq(keys->d[idx].k->k.p, pos))
- keys->d[idx].overwritten = true;
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ !keys->data[idx].overwritten) {
+ mutex_lock(&keys->overwrite_lock);
+ __bch2_journal_key_overwritten(keys, idx);
+ mutex_unlock(&keys->overwrite_lock);
+ }
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -284,19 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->d + iter->idx;
+ struct bkey_s_c ret = bkey_s_c_null;
+
+ journal_iter_verify(iter);
- while (k < iter->keys->d + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
- if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
+ rcu_read_lock();
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
- bch2_journal_iter_advance(iter);
- k = iter->keys->d + iter->idx;
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ if (cmp < 0)
+ break;
+ BUG_ON(cmp);
+
+ if (!k->overwritten) {
+ ret = bkey_i_to_s_c(k->k);
+ break;
+ }
+
+ if (k->overwritten_range)
+ iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+ else
+ bch2_journal_iter_advance(iter);
}
+ rcu_read_unlock();
- return bkey_s_c_null;
+ return ret;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -313,6 +499,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -334,20 +522,57 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+ struct btree_and_journal_iter iter = *_iter;
+ struct bch_fs *c = iter.trans->c;
+ unsigned level = iter.journal.level;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (level > 1 ? 0 : 2)
+ : (level > 1 ? 1 : 16);
+
+ iter.prefetch = false;
+ iter.fail_if_too_many_whiteouts = true;
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr--) {
+ bch2_btree_and_journal_iter_advance(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c btree_k, journal_k, ret;
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+ size_t iters = 0;
+
+ if (iter->prefetch && iter->journal.level)
+ btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
+ iters++;
+
+ if (iters > 20 && iter->fail_if_too_many_whiteouts)
+ return bkey_s_c_null;
+
while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -376,51 +601,44 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &c->journal_iters);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct bch_fs *c)
-{
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- if (*i)
- kvpfree(*i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&(*i)->j));
- genradix_free(&c->journal_entries);
-}
-
/*
* When keys compare equal, oldest compares first:
*/
@@ -437,106 +655,78 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ if (i->overwritten_range &&
+ (i == &darray_last(*keys) ||
+ i->overwritten_range != i[1].overwritten_range))
+ kfree(i->overwritten_range);
- for (i = keys->d; i < keys->d + keys->nr; i++)
if (i->allocated)
kfree(i->k);
+ }
- kvfree(keys->d);
- keys->d = NULL;
+ kvfree(keys->data);
+ keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
- bch2_journal_entries_free(c);
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ kvfree(*i);
+ genradix_free(&c->journal_entries);
}
static void __journal_keys_sort(struct journal_keys *keys)
{
- struct journal_key *src, *dst;
+ sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
- sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+ cond_resched();
- src = dst = keys->d;
- while (src < keys->d + keys->nr) {
- while (src + 1 < keys->d + keys->nr &&
- !journal_key_cmp(src, src + 1))
- src++;
+ struct journal_key *dst = keys->data;
+
+ darray_for_each(*keys, src) {
+ /*
+ * We don't accumulate accounting keys here because we have to
+ * compare each individual accounting key against the version in
+ * the btree during replay:
+ */
+ if (src->k->k.type != KEY_TYPE_accounting &&
+ src + 1 < &darray_top(*keys) &&
+ !journal_key_cmp(src, src + 1))
+ continue;
- *dst++ = *src++;
+ *dst++ = *src;
}
- keys->nr = dst - keys->d;
+ keys->nr = dst - keys->data;
}
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
- struct jset_entry *entry;
- struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys;
- size_t nr_keys = 0, nr_read = 0;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
- continue;
-
- for_each_jset_key(k, entry, &i->j)
- nr_keys++;
- }
-
- if (!nr_keys)
- return 0;
-
- keys->size = roundup_pow_of_two(nr_keys);
-
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- if (!keys->d) {
- bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
- nr_keys);
-
- do {
- keys->size >>= 1;
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- } while (!keys->d && keys->size > nr_keys / 8);
-
- if (!keys->d) {
- bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
- keys->size);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
+ size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
- if (keys->nr == keys->size) {
- __journal_keys_sort(keys);
-
- if (keys->nr > keys->size * 7 / 8) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
- keys->nr, keys->size, nr_read, nr_keys);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- keys->d[keys->nr++] = (struct journal_key) {
+ struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
@@ -544,6 +734,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.journal_offset = k->_data - i->j._data,
};
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+
+ BUG_ON(darray_push(keys, n));
+ }
+
nr_read++;
}
}
@@ -551,6 +753,54 @@ int bch2_journal_keys_sort(struct bch_fs *c)
__journal_keys_sort(keys);
keys->gap = keys->nr;
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+ unsigned level_min, unsigned level_max,
+ struct bpos start, struct bpos end)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t dst = 0;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i)
+ if (!(i->btree_id == btree &&
+ i->level >= level_min &&
+ i->level <= level_max &&
+ bpos_ge(i->k->k.p, start) &&
+ bpos_le(i->k->k.p, end)))
+ keys->data[dst++] = *i;
+ keys->nr = keys->gap = dst;
+}
+
+void bch2_journal_keys_dump(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct printbuf buf = PRINTBUF;
+
+ pr_info("%zu keys:", keys->nr);
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, i->btree_id);
+ prt_printf(&buf, " l=%u ", i->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err("%s", buf.buf);
+ }
+ printbuf_exit(&buf);
+}
+
+void bch2_fs_journal_keys_init(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ atomic_set(&keys->ref, 1);
+ keys->initial_ref_held = true;
+ mutex_init(&keys->overwrite_lock);
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 8ca4c100b2e3..2a3082919b8d 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+#include "bkey.h"
+
struct journal_iter {
struct list_head list;
enum btree_id btree_id;
@@ -15,6 +17,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
+ struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@@ -22,32 +25,60 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
+ bool prefetch;
+ bool fail_if_too_many_whiteouts;
};
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ const struct journal_key *r)
+{
+ return -cmp_int(l_level, r->level) ?:
+ cmp_int(l_btree_id, r->btree_id);
+}
+
+static inline int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ const struct journal_key *r)
+{
+ return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
+ bpos_cmp(l_pos, r->k->k.p);
+}
+
+static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+ struct btree_and_journal_iter *);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *,
- struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);
@@ -58,8 +89,14 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
c->journal_keys.initial_ref_held = false;
}
-void bch2_journal_entries_free(struct bch_fs *);
-
int bch2_journal_keys_sort(struct bch_fs *);
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
+void bch2_journal_keys_dump(struct bch_fs *);
+
+void bch2_fs_journal_keys_init(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
new file mode 100644
index 000000000000..8b773823704f
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+
+struct journal_key_range_overwritten {
+ size_t start, end;
+};
+
+struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ bool allocated;
+ bool overwritten;
+ struct journal_key_range_overwritten __rcu *
+ overwritten_range;
+ struct bkey_i *k;
+};
+
+struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
+ struct journal_key *data;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
+ atomic_t ref;
+ bool initial_ref_held;
+ struct mutex overwrite_lock;
+};
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 74e52fd28abe..edce59433375 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -32,12 +32,22 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .automatic_shrinking = true,
};
+static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_cached *ck,
+ enum btree_node_locked_type lock_held)
+{
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+ mark_btree_node_locked(trans, path, 0, lock_held);
+}
+
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
@@ -69,224 +79,95 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
- BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params));
- memset(&ck->key, ~0, sizeof(ck->key));
+ bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params);
+ if (ret) {
+ memset(&ck->key, ~0, sizeof(ck->key));
+ atomic_long_dec(&c->nr_keys);
+ }
- atomic_long_dec(&c->nr_keys);
+ return ret;
}
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
+ struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+ struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
- if (ck->c.lock.readers) {
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- } else {
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- }
- atomic_long_inc(&bc->nr_freed);
+ this_cpu_dec(*c->btree_key_cache.nr_pending);
+ kmem_cache_free(bch2_key_cache, ck);
+}
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
-}
-
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bkey_cached *pos;
-
- bc->nr_freed_nonpcpu++;
-
- list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
- if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
- pos->btree_trans_barrier_seq)) {
- list_move(&ck->list, &pos->list);
- return;
- }
- }
- list_move(&ck->list, &bc->freed_nonpcpu);
+ bool pcpu_readers = ck->c.lock.readers != NULL;
+ rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+ this_cpu_inc(*bc->nr_pending);
}
-#endif
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
- bool freed = false;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- if (f->nr < ARRAY_SIZE(f->objs)) {
- f->objs[f->nr++] = ck;
- freed = true;
- }
- preempt_enable();
-
- if (!freed) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (f->nr > ARRAY_SIZE(f->objs) / 2) {
- struct bkey_cached *ck2 = f->objs[--f->nr];
-
- __bkey_cached_move_to_freelist_ordered(bc, ck2);
- }
- preempt_enable();
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
- __bkey_cached_move_to_freelist_ordered(bc, ck);
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_pcpu);
- mutex_unlock(&bc->lock);
+ struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
+ if (unlikely(!ck))
+ return NULL;
+ ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
+ if (unlikely(!ck->k)) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return NULL;
}
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- list_del_init(&ck->list);
- atomic_long_inc(&bc->nr_freed);
-
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
-
- bkey_cached_move_to_freelist(bc, ck);
-
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
+ ck->u64s = key_u64s;
+ return ck;
}
static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
- bool *was_new)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
- if (!pcpu_readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
- if (f->nr)
- ck = f->objs[--f->nr];
- preempt_enable();
-
- if (!ck) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (!list_empty(&bc->freed_nonpcpu) &&
- f->nr < ARRAY_SIZE(f->objs) / 2) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- f->objs[f->nr++] = ck;
- }
-
- ck = f->nr ? f->objs[--f->nr] : NULL;
- preempt_enable();
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_nonpcpu)) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- }
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_pcpu)) {
- ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- }
- mutex_unlock(&bc->lock);
- }
-
- if (ck) {
- ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
- if (unlikely(ret)) {
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- path->l[0].b = (void *) ck;
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
-
- ret = bch2_btree_node_lock_write(trans, path, &ck->c);
- if (unlikely(ret)) {
- btree_node_unlock(trans, path, 0);
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- return ck;
- }
+ struct bkey_cached *ck = container_of_or_null(
+ rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
ck = allocate_dropping_locks(trans, ret,
- kmem_cache_zalloc(bch2_key_cache, _gfp));
+ __bkey_cached_alloc(key_u64s, _gfp));
if (ret) {
+ if (ck)
+ kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
return ERR_PTR(ret);
}
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
-
- ck->c.cached = true;
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
- *was_new = true;
+ if (ck) {
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
+ ck->c.cached = true;
+ goto lock;
+ }
+
+ ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
+lock:
+ six_lock_intent(&ck->c.lock, NULL, NULL);
+ six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@@ -298,310 +179,215 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
- mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
- goto out;
+ if (bkey_cached_evict(c, ck))
+ goto out;
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
- mutex_unlock(&c->lock);
return ck;
}
-static struct bkey_cached *
-btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+static int btree_key_cache_create(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path *ck_path,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck;
- bool was_new = false;
- ck = bkey_cached_alloc(trans, path, &was_new);
- if (IS_ERR(ck))
- return ck;
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (it won't be used):
+ */
+ unsigned key_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ key_u64s = min(256U, (key_u64s * 3) / 2);
+ key_u64s = roundup_pow_of_two(key_u64s);
+
+ struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
+ int ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ return ret;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(path->btree_id));
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+ bch2_btree_id_str(ck_path->btree_id));
+ return -BCH_ERR_ENOMEM_btree_key_cache_create;
}
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
}
ck->c.level = 0;
- ck->c.btree_id = path->btree_id;
- ck->key.btree_id = path->btree_id;
- ck->key.pos = path->pos;
- ck->valid = false;
+ ck->c.btree_id = ck_path->btree_id;
+ ck->key.btree_id = ck_path->btree_id;
+ ck->key.pos = ck_path->pos;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
- &ck->hash,
- bch2_btree_key_cache_params))) {
- /* We raced with another fill: */
-
- if (likely(was_new)) {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- kfree(ck);
- } else {
- bkey_cached_free_fast(bc, ck);
+ if (unlikely(key_u64s > ck->u64s)) {
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
+
+ struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
+ kmalloc(key_u64s * sizeof(u64), _gfp));
+ if (unlikely(!new_k)) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), key_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ } else if (ret) {
+ kfree(new_k);
+ goto err;
}
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
- return NULL;
+ kfree(ck->k);
+ ck->k = new_k;
+ ck->u64s = key_u64s;
}
- atomic_long_inc(&bc->nr_keys);
+ bkey_reassemble(ck->k, k);
+
+ ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
+ if (unlikely(ret))
+ goto err;
+ ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+
+ bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
+
+ if (unlikely(ret)) /* raced with another fill? */
+ goto err;
+
+ atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
- return ck;
+ enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
+ if (lock_want == SIX_LOCK_read)
+ six_lock_downgrade(&ck->c.lock);
+ btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+err:
+ bkey_cached_free(bc, ck);
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
+
+ return ret;
}
-static int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- struct bkey_cached *ck)
+static noinline int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ unsigned flags)
{
+ if (flags & BTREE_ITER_cached_nofill) {
+ ck_path->l[0].b = NULL;
+ return 0;
+ }
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned new_u64s = 0;
- struct bkey_i *new_k = NULL;
int ret;
- k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
+ bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
+ BTREE_ITER_intent|
+ BTREE_ITER_key_cache_fill|
+ BTREE_ITER_cached_nofill);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- new_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- new_u64s = min(256U, (new_u64s * 3) / 2);
-
- if (new_u64s > ck->u64s) {
- new_u64s = roundup_pow_of_two(new_u64s);
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (!new_k) {
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(ck->key.btree_id), new_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
- goto err;
- }
-
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- kfree(new_k);
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_k);
- goto err;
- }
- }
- }
+ /* Recheck after btree lookup, before allocating: */
+ ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
+ if (unlikely(ret))
+ goto out;
- ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
- if (ret) {
- kfree(new_k);
+ ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
+ if (ret)
goto err;
- }
- if (new_k) {
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- }
-
- bkey_reassemble(ck->k, k);
- ck->valid = true;
- bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
+ if (trace_key_cache_fill_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
+out:
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static noinline int
-bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
+static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
+ struct btree_path *path)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
- int ret = 0;
-
- BUG_ON(path->level);
-
- path->l[1].b = NULL;
-
- if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
- }
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck) {
- ck = btree_key_cache_create(trans, path);
- ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- goto err;
- if (!ck)
- goto retry;
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
- path->locks_want = 1;
- } else {
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- BUG_ON(ret);
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
-
- mark_btree_node_locked(trans, path, 0,
- (enum btree_node_locked_type) lock_want);
- }
-
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
-fill:
- path->uptodate = BTREE_ITER_UPTODATE;
-
- if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
- /*
- * Using the underscore version because we haven't set
- * path->uptodate yet:
- */
- if (!path->locks_want &&
- !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
- trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
- goto err;
- }
+ if (!ck)
+ return -ENOENT;
- ret = btree_key_cache_fill(trans, path, ck);
- if (ret)
- goto err;
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
- ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
- if (ret)
- goto err;
+ int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
+ if (ret)
+ return ret;
- path->uptodate = BTREE_ITER_UPTODATE;
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
}
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
- BUG_ON(path->uptodate);
-
- return ret;
-err:
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- return ret;
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
}
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
- int ret = 0;
-
EBUG_ON(path->level);
path->l[1].b = NULL;
- if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
- }
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck) {
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
- } else {
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (ret)
- return ret;
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
+ int ret;
+ do {
+ ret = btree_path_traverse_cached_fast(trans, path);
+ if (unlikely(ret == -ENOENT))
+ ret = btree_key_cache_fill(trans, path, flags);
+ } while (ret == -EEXIST);
+
+ if (unlikely(ret)) {
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
}
-
- mark_btree_node_locked(trans, path, 0,
- (enum btree_node_locked_type) lock_want);
}
-
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
-fill:
- if (!ck->valid)
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- path->uptodate = BTREE_ITER_UPTODATE;
- EBUG_ON(!ck->valid);
- EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
-
return ret;
}
@@ -618,13 +404,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_slots|
+ BTREE_ITER_intent|
+ BTREE_ITER_all_snapshots);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ b_iter.flags &= ~BTREE_ITER_with_key_cache;
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
@@ -640,8 +426,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
}
- BUG_ON(!ck->valid);
-
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
@@ -657,24 +441,31 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
- ret = bch2_btree_iter_traverse(&b_iter) ?:
- bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_KEY_CACHE_RECLAIM|
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- BTREE_TRIGGER_NORUN) ?:
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ ret = bkey_err(btree_k);
+ if (ret)
+ goto err;
+
+ /* * Check that we're not violating cache coherency rules: */
+ BUG_ON(bkey_deleted(btree_k.k));
+
+ ret = bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_key_cache_reclaim|
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
commit_flags);
-
+err:
bch2_fs_fatal_err_on(ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
!bch2_journal_error(j), c,
- "error flushing key cache: %s", bch2_err_str(ret));
+ "flushing key cache: %s", bch2_err_str(ret));
if (ret)
goto out;
@@ -704,8 +495,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- bkey_cached_evict(&c->btree_key_cache, ck);
- bkey_cached_free_fast(&c->btree_key_cache, ck);
+ if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+ bkey_cached_free(&c->btree_key_cache, ck);
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@@ -763,7 +558,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
bkey_copy(ck->k, insert);
- ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
@@ -786,7 +580,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
* flushing. The flush callback will not proceed unless ->seq matches
* the latest pin, so make sure it starts with a consistent value.
*/
- if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
!journal_pin_active(&ck->journal)) {
ck->seq = trans->journal_res.seq;
}
@@ -802,10 +596,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = (void *) path->l[0].b;
- BUG_ON(!ck->valid);
-
/*
* We just did an update to the btree, bypassing the key cache: the key
* cache key is now stale and must be dropped, even if dirty:
@@ -816,7 +609,22 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
bch2_journal_pin_drop(&c->journal, &ck->journal);
}
- ck->valid = false;
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+
+ struct btree_path *path2;
+ unsigned i;
+ trans_for_each_path(trans, path2, i)
+ if (path2->l[0].b == (void *) ck) {
+ __bch2_btree_path_unlock(trans, path2);
+ path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
+ path2->should_be_locked = false;
+ btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
+ }
+
+ bch2_trans_verify_locks(trans);
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
@@ -825,97 +633,75 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
- struct bkey_cached *ck, *t;
+ struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned start, flags;
+ unsigned iter, start;
int srcu_idx;
- mutex_lock(&bc->lock);
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- flags = memalloc_nofs_save();
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
- * Newest freed entries are at the end of the list - once we hit one
- * that's too new to be freed, we can bail out:
+ * Scanning is expensive while a rehash is in progress - most elements
+ * will be on the new hashtable, if it's in progress
+ *
+ * A rehash could still start while we're scanning - that's ok, we'll
+ * still see most elements.
*/
- scanned += bc->nr_freed_nonpcpu;
-
- list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- freed++;
- bc->nr_freed_nonpcpu--;
- }
-
- if (scanned >= nr)
- goto out;
-
- scanned += bc->nr_freed_pcpu;
-
- list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- freed++;
- bc->nr_freed_pcpu--;
+ if (unlikely(tbl->nest)) {
+ rcu_read_unlock();
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ return SHRINK_STOP;
}
- if (scanned >= nr)
- goto out;
-
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- start = bc->shrink_iter;
+ iter = bc->shrink_iter;
+ if (iter >= tbl->size)
+ iter = 0;
+ start = iter;
do {
struct rhash_head *pos, *next;
- pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+ pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
- goto next;
-
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ bc->skipped_dirty++;
+ } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(bc, ck);
+ bc->skipped_accessed++;
+ } else if (!bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_lock_fail++;
+ } else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
+ bc->freed++;
+ freed++;
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
- break;
-next:
+ goto out;
+
pos = next;
}
- bc->shrink_iter++;
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- } while (scanned < nr && bc->shrink_iter != start);
+ iter++;
+ if (iter >= tbl->size)
+ iter = 0;
+ } while (scanned < nr && iter != start);
+out:
+ bc->shrink_iter = iter;
rcu_read_unlock();
-out:
- memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- mutex_unlock(&bc->lock);
return freed;
}
@@ -928,6 +714,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
+ /*
+ * Avoid hammering our shrinker too much if it's nearly empty - the
+ * shrinker code doesn't take into account how big our cache is, if it's
+ * mostly empty but the system is under memory pressure it causes nasty
+ * lock contention:
+ */
+ nr -= 128;
+
return max(0L, nr);
}
@@ -935,60 +729,36 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
- struct bkey_cached *ck, *n;
+ struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
-#ifdef __KERNEL__
- int cpu;
-#endif
shrinker_free(bc->shrink);
- mutex_lock(&bc->lock);
-
/*
* The loop is needed to guard against racing with rehash:
*/
while (atomic_long_read(&bc->nr_keys)) {
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl)
+ if (tbl) {
+ if (tbl->nest) {
+ /* wait for in progress rehash */
+ rcu_read_unlock();
+ mutex_lock(&bc->table.mutex);
+ mutex_unlock(&bc->table.mutex);
+ continue;
+ }
for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &items);
+ while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
+ ck = container_of(pos, struct bkey_cached, hash);
+ BUG_ON(!bkey_cached_evict(bc, ck));
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
}
- rcu_read_unlock();
- }
-
-#ifdef __KERNEL__
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
}
- }
-#endif
-
- BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
- BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
- list_splice(&bc->freed_pcpu, &items);
- list_splice(&bc->freed_nonpcpu, &items);
-
- mutex_unlock(&bc->lock);
-
- list_for_each_entry_safe(ck, n, &items, list) {
- cond_resched();
-
- list_del(&ck->list);
- kfree(ck->k);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
+ rcu_read_unlock();
}
if (atomic_long_read(&bc->nr_dirty) &&
@@ -1004,14 +774,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
- free_percpu(bc->pcpu_freed);
+ rcu_pending_exit(&bc->pending[0]);
+ rcu_pending_exit(&bc->pending[1]);
+
+ free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
- mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed_pcpu);
- INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -1019,11 +789,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
-#ifdef __KERNEL__
- bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
- if (!bc->pcpu_freed)
+ bc->nr_pending = alloc_percpu(size_t);
+ if (!bc->nr_pending)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+ rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@@ -1034,22 +806,32 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
- shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
}
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
- prt_newline(out);
- prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
+ printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 12);
+
+ prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
+ prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
prt_newline(out);
- prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
+ prt_printf(out, "shrinker:\n");
+ prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
+ prt_printf(out, "freed:\t%lu\r\n", bc->freed);
+ prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
+ prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
+ prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
prt_newline(out);
+ prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index e6b2cd0dd2c1..51d6289b8dee 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -11,13 +11,27 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
return max_t(ssize_t, 0, nr_dirty - max_dirty);
}
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
{
size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty;
+ return nr_dirty - max_dirty;
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ return __bch2_btree_key_cache_must_wait(c) > 0;
+}
+
+static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 2048 + (nr_keys * 5) / 8;
+
+ return nr_dirty <= max_dirty;
}
int bch2_btree_key_cache_journal_flush(struct journal *,
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 290e4e57df5b..722f1ed10551 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -2,28 +2,28 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
+#include "rcu_pending.h"
struct btree_key_cache {
- struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed_pcpu;
- size_t nr_freed_pcpu;
- struct list_head freed_nonpcpu;
- size_t nr_freed_nonpcpu;
-
struct shrinker *shrink;
unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
- atomic_long_t nr_freed;
+ /* 0: non pcpu reader locks, 1: pcpu reader locks */
+ struct rcu_pending pending[2];
+ size_t __percpu *nr_pending;
+
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
+
+ /* shrinker stats */
+ unsigned long requested_to_free;
+ unsigned long freed;
+ unsigned long skipped_dirty;
+ unsigned long skipped_accessed;
+ unsigned long skipped_lock_fail;
};
struct bkey_cached_key {
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 684397442338..caef65adeae4 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -7,22 +7,13 @@
static struct lock_class_key bch2_btree_node_lock_key;
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags)
+ enum six_lock_init_flags flags,
+ gfp_t gfp)
{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
- lockdep_set_novalidate_class(&b->lock);
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
+ lockdep_set_notrack_class(&b->lock);
}
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void)
-{
-#if 0
- //Re-enable when lock_class_is_held() is merged:
- BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
-#endif
-}
-#endif
-
/* Btree node locking: */
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
@@ -83,8 +74,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
- prt_printf(out, "Found lock cycle (%u entries):", g->nr);
- prt_newline(out);
+ prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
for (i = g->g; i < g->g + g->nr; i++) {
struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
@@ -120,6 +110,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g)
lock_graph_up(g);
}
+static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+}
+
static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
{
g->g[g->nr++] = (struct trans_waiting_for_lock) {
@@ -135,15 +131,20 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
__lock_graph_down(g, trans);
}
-static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+static bool lock_graph_remove_non_waiters(struct lock_graph *g,
+ struct trans_waiting_for_lock *from)
{
struct trans_waiting_for_lock *i;
- for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (from->trans->locking != from->node_want) {
+ lock_graph_pop_from(g, from);
+ return true;
+ }
+
+ for (i = from + 1; i < g->g + g->nr; i++)
if (i->trans->locking != i->node_want ||
i->trans->locking_wait.start_time != i[-1].lock_start_time) {
- while (g->g + g->nr > i)
- lock_graph_up(g);
+ lock_graph_pop_from(g, i);
return true;
}
@@ -190,13 +191,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
return 3;
}
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
+ struct trans_waiting_for_lock *from)
{
struct trans_waiting_for_lock *i, *abort = NULL;
unsigned best = 0, pref;
int ret;
- if (lock_graph_remove_non_waiters(g))
+ if (lock_graph_remove_non_waiters(g, from))
return 0;
/* Only checking, for debugfs: */
@@ -206,7 +208,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
goto out;
}
- for (i = g->g; i < g->g + g->nr; i++) {
+ for (i = from; i < g->g + g->nr; i++) {
pref = btree_trans_abort_preference(i->trans);
if (pref > best) {
abort = i;
@@ -216,6 +218,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
@@ -224,15 +227,14 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
bch2_btree_trans_to_text(&buf, trans);
- prt_printf(&buf, "backtrace:");
- prt_newline(&buf);
+ prt_printf(&buf, "backtrace:\n");
printbuf_indent_add(&buf, 2);
bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
printbuf_exit(&buf);
BUG();
}
@@ -240,8 +242,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
ret = abort_lock(g, abort);
out:
if (ret)
- while (g->nr)
- lock_graph_up(g);
+ lock_graph_pop_all(g);
+ else
+ lock_graph_pop_from(g, abort);
return ret;
}
@@ -254,7 +257,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
for (i = g->g; i < g->g + g->nr; i++)
if (i->trans == trans) {
closure_put(&trans->ref);
- return break_cycle(g, cycle);
+ return break_cycle(g, cycle, i);
}
if (g->nr == ARRAY_SIZE(g->g)) {
@@ -263,8 +266,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
if (orig_trans->lock_may_not_fail)
return 0;
- while (g->nr)
- lock_graph_up(g);
+ lock_graph_pop_all(g);
if (cycle)
return 0;
@@ -292,7 +294,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
g.nr = 0;
- if (trans->lock_must_abort) {
+ if (trans->lock_must_abort && !trans->lock_may_not_fail) {
if (cycle)
return -1;
@@ -347,7 +349,7 @@ next:
* structures - which means it can't be blocked
* waiting on a lock:
*/
- if (!lock_graph_remove_non_waiters(&g)) {
+ if (!lock_graph_remove_non_waiters(&g, g.g)) {
/*
* If lock_graph_remove_non_waiters()
* didn't do anything, it must be
@@ -440,33 +442,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
@@ -518,8 +494,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
- bch2_trans_verify_locks(trans);
-
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
@@ -551,7 +525,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = path->l[level].b;
- struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
if (!is_btree_node(path, level))
return false;
@@ -575,24 +548,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans,
if (race_fault())
return false;
- if (btree_node_locked(path, level)) {
- bool ret;
-
- six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
- ret = six_lock_tryupgrade(&b->c.lock);
- six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
-
- if (ret)
- goto success;
- } else {
- if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
- }
+ if (btree_node_locked(path, level)
+ ? six_lock_tryupgrade(&b->c.lock)
+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
- /*
- * Do we already have an intent lock via another path? If so, just bump
- * lock count:
- */
if (btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(trans, path, level);
@@ -635,7 +595,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa
{
struct get_locks_fail f;
- return btree_path_get_locks(trans, path, false, &f);
+ bool ret = btree_path_get_locks(trans, path, false, &f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -658,7 +620,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
path->locks_want = new_locks_want;
- return btree_path_get_locks(trans, path, true, f);
+ bool ret = btree_path_get_locks(trans, path, true, f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
@@ -666,8 +630,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
- return true;
+ bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
+ if (ret)
+ goto out;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
@@ -701,8 +666,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
btree_path_get_locks(trans, linked, true, NULL);
}
}
-
- return false;
+out:
+ bch2_trans_verify_locks(trans);
+ return ret;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
@@ -747,85 +713,102 @@ void bch2_trans_downgrade(struct btree_trans *trans)
return;
trans_for_each_path(trans, path, i)
- bch2_btree_path_downgrade(trans, path);
+ if (path->ref)
+ bch2_btree_path_downgrade(trans, path);
}
-int bch2_trans_relock(struct btree_trans *trans)
+static inline void __bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
unsigned i;
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
+ trans_for_each_path(trans, path, i)
+ __bch2_btree_path_unlock(trans, path);
+}
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
+static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+ struct get_locks_fail *f, bool trace)
+{
+ if (!trace)
+ goto out;
- if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f)) {
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=",
- f.l, path->l[f.l].lock_seq);
- if (IS_ERR_OR_NULL(f.b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
- } else {
- prt_printf(&buf, "%u", f.b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f.b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ if (IS_ERR_OR_NULL(f->b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
+ } else {
+ prt_printf(&buf, "%u", f->b->c.lock.seq);
- count_event(trans->c, trans_restart_relock);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f->b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
}
- return 0;
+ count_event(trans->c, trans_restart_relock);
+out:
+ __bch2_trans_unlock(trans);
+ bch2_trans_verify_locks(trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
-int bch2_trans_relock_notrace(struct btree_trans *trans)
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
{
- struct btree_path *path;
- unsigned i;
+ bch2_trans_verify_locks(trans);
if (unlikely(trans->restarted))
return -((int) trans->restarted);
+ if (unlikely(trans->locked))
+ goto out;
+
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
- trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path)) {
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- }
+ !btree_path_get_locks(trans, path, false, &f))
+ return bch2_trans_relock_fail(trans, path, &f, trace);
+ }
+
+ trans_set_locked(trans, true);
+out:
+ bch2_trans_verify_locks(trans);
return 0;
}
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, true);
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, false);
+}
+
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
@@ -834,15 +817,15 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
}
-bool bch2_trans_locked(struct btree_trans *trans)
+void bch2_trans_unlock_write(struct btree_trans *trans)
{
struct btree_path *path;
unsigned i;
trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_write_locked(path, l))
+ bch2_btree_node_unlock_write(trans, path, path->l[l].b);
}
int __bch2_trans_mutex_lock(struct btree_trans *trans,
@@ -861,15 +844,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
void bch2_btree_path_verify_locks(struct btree_path *path)
{
- unsigned l;
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level) &&
+ !path->nodes_locked);
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
+ if (!path->nodes_locked)
return;
- }
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
int have = btree_node_locked_type(path, l);
@@ -879,11 +866,30 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
(want == BTREE_NODE_UNLOCKED ||
have != BTREE_NODE_WRITE_LOCKED) &&
want != have);
+
+ BUG_ON(btree_node_locked(path, l) &&
+ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
}
}
+static bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
void bch2_trans_verify_locks(struct btree_trans *trans)
{
+ if (!trans->locked) {
+ BUG_ON(bch2_trans_locked(trans));
+ return;
+ }
+
struct btree_path *path;
unsigned i;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 4bd72c855da1..b33ab7af8440 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -13,15 +13,10 @@
#include "btree_iter.h"
#include "six.h"
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
-
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void);
-#else
-static inline void bch2_assert_btree_nodes_not_locked(void) {}
-#endif
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
void bch2_trans_unlock_noassert(struct btree_trans *);
+void bch2_trans_unlock_write(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
@@ -81,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path,
path->nodes_locked |= (type + 1) << (level << 1);
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
-{
- EBUG_ON(btree_node_write_locked(path, level));
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
-}
-
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
@@ -130,6 +118,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
/* unlock: */
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
static inline void btree_node_unlock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
@@ -138,10 +129,14 @@ static inline void btree_node_unlock(struct btree_trans *trans,
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED) {
+ if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[level].b);
+ lock_type = BTREE_NODE_INTENT_LOCKED;
+ }
six_unlock_type(&path->l[level].b->c.lock, lock_type);
btree_trans_lock_hold_time_update(trans, path, level);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
- mark_btree_node_unlocked(path, level);
}
static inline int btree_path_lowest_level_locked(struct btree_path *path)
@@ -168,47 +163,76 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
* succeed:
*/
static inline void
+__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
+{
+ if (!b->c.lock.write_lock_recurse) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ linked->l[b->c.level].lock_seq++;
+ }
+
+ six_unlock_write(&b->c.lock);
+}
+
+static inline void
bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
struct btree *b)
{
- struct btree_path *linked;
- unsigned i;
-
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
-
- six_unlock_write(&b->c.lock);
+ __bch2_btree_node_unlock_write(trans, b);
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */
+static inline void trans_set_locked(struct btree_trans *trans, bool try)
+{
+ if (!trans->locked) {
+ lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
+ trans->locked = true;
+ trans->last_unlock_ip = 0;
+
+ trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
+ current->flags |= PF_MEMALLOC_NOFS;
+ }
+}
+
+static inline void trans_set_unlocked(struct btree_trans *trans)
+{
+ if (trans->locked) {
+ lock_release(&trans->dep_map, _THIS_IP_);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
+
+ if (!trans->pf_memalloc_nofs)
+ current->flags &= ~PF_MEMALLOC_NOFS;
+ }
+}
+
static inline int __btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type,
bool lock_may_not_fail,
unsigned long ip)
{
- int ret;
-
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
- ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
+ int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
+
+ if (!ret)
+ trace_btree_path_lock(trans, _THIS_IP_, b);
return ret;
}
@@ -262,6 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -364,14 +389,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- struct get_locks_fail f;
+ struct get_locks_fail f = {};
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->uptodate == BTREE_ITER_UPTODATE)
+ : path->nodes_locked)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
@@ -381,12 +406,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
/* misc: */
-static inline void btree_path_set_should_be_locked(struct btree_path *path)
+static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 000000000000..a7f06deee13c
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/min_heap.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+ struct closure *cl;
+ struct find_btree_nodes *f;
+ struct bch_dev *ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+ bch2_btree_id_level_to_text(out, n->btree_id, n->level);
+ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
+ n->seq, n->journal_seq, n->cookie);
+ bch2_bpos_to_text(out, n->min_key);
+ prt_str(out, "-");
+ bch2_bpos_to_text(out, n->max_key);
+
+ if (n->range_updated)
+ prt_str(out, " range updated");
+
+ for (unsigned i = 0; i < n->nr_ptrs; i++) {
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+ }
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+ printbuf_indent_add(out, 2);
+ darray_for_each(nodes, i) {
+ found_btree_node_to_text(out, c, i);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.sectors_written = 0;
+ bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static inline u64 bkey_journal_seq(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
+ default:
+ return 0;
+ }
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+ struct found_btree_node *f)
+{
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, f);
+
+ struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
+ bool ret = !IS_ERR_OR_NULL(b);
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
+ f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
+
+ six_unlock_read(&b->c.lock);
+
+ /*
+ * We might update this node's range; if that happens, we need the node
+ * to be re-read so the read path can trim keys that are no longer in
+ * this node
+ */
+ if (b != btree_node_root(trans->c, b))
+ bch2_btree_node_evict(trans, &tmp.k);
+ return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq) ?:
+ cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
+{
+ return found_btree_node_cmp_pos(l, r) < 0;
+}
+
+static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
+{
+ struct found_btree_node *l = _l;
+ struct found_btree_node *r = _r;
+
+ swap(*l, *r);
+}
+
+static const struct min_heap_callbacks found_btree_node_heap_cbs = {
+ .less = found_btree_node_cmp_pos_less,
+ .swp = found_btree_node_swap,
+};
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ if (!c->chacha20)
+ return;
+
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
+ if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+ return;
+
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+ void *buf = (void *) __get_free_page(GFP_KERNEL);
+ struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
+
+ if (!buf || !bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+ for (unsigned bucket_offset = 0;
+ bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+ bucket_offset += btree_sectors(c)) {
+ if (time_after(jiffies, last_print + HZ * 30)) {
+ u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+ u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+ bch_info(ca, "%s: %2u%% done", __func__,
+ (unsigned) div64_u64(cur_sector * 100, end_sector));
+ last_print = jiffies;
+ }
+
+ u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+ try_read_btree_node(w->f, ca, bio, buf, sector);
+ }
+err:
+ bio_put(bio);
+ free_page((unsigned long) buf);
+ percpu_ref_get(&ca->io_ref);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ struct task_struct *t;
+
+ if (!w) {
+ percpu_ref_put(&ca->io_ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ percpu_ref_get(&ca->io_ref);
+ closure_get(&cl);
+ w->cl = &cl;
+ w->f = f;
+ w->ca = ca;
+
+ t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ closure_put(&cl);
+ f->ret = ret;
+ bch_err(c, "error starting kthread: %i", ret);
+ break;
+ }
+ }
+err:
+ closure_sync(&cl);
+ return f->ret ?: ret;
+}
+
+static bool nodes_overlap(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return (l->btree_id == r->btree_id &&
+ l->level == r->level &&
+ bpos_gt(l->max_key, r->min_key));
+}
+
+static int handle_overwrites(struct bch_fs *c,
+ struct found_btree_node *l,
+ found_btree_nodes *nodes_heap)
+{
+ struct found_btree_node *r;
+
+ while ((r = min_heap_peek(nodes_heap)) &&
+ nodes_overlap(l, r)) {
+ int cmp = found_btree_node_cmp_time(l, r);
+
+ if (cmp > 0) {
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
+ else {
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
+ }
+ } else if (cmp < 0) {
+ BUG_ON(bpos_eq(l->min_key, r->min_key));
+
+ l->max_key = bpos_predecessor(r->min_key);
+ l->range_updated = true;
+ } else if (r->level) {
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
+ } else {
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
+ else {
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ found_btree_nodes nodes_heap = {};
+ size_t dst;
+ int ret = 0;
+
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ swap(nodes_heap, f->nodes);
+
+ {
+ /* darray must have same layout as a heap */
+ min_heap_char real_heap;
+ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
+ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
+ }
+
+ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+
+ if (nodes_heap.nr) {
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
+ if (ret)
+ goto err;
+
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+ }
+
+ while (true) {
+ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
+ if (ret)
+ goto err;
+
+ if (!nodes_heap.nr)
+ break;
+
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
+ if (ret)
+ goto err;
+
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+ }
+
+ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
+ BUG_ON(nodes_overlap(n, n + 1));
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ } else {
+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
+ }
+
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ darray_exit(&nodes_heap);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
+#define for_each_found_btree_node_in_range(_f, _search, _idx) \
+ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
+ sizeof((_f)->nodes.data[0]), \
+ found_btree_node_range_start_cmp, &search); \
+ _idx < (_f)->nodes.nr && \
+ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
+ (_f)->nodes.data[_idx].level == _search.level && \
+ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
+ _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ struct found_btree_node search = {
+ .btree_id = b->c.btree_id,
+ .level = b->c.level,
+ .min_key = b->data->min_key,
+ .max_key = b->key.k.p,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx)
+ if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+ return true;
+ return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+ .min_key = POS_MIN,
+ .max_key = SPOS_MAX,
+ };
+
+ for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+ return true;
+ return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "recovery ");
+ bch2_btree_id_level_to_text(&buf, btree, level);
+ prt_str(&buf, " ");
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ struct found_btree_node n = f->nodes.data[idx];
+
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = level + 1,
+ .btree = btree,
+ }));
+
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 000000000000..08687b209787
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 000000000000..2811b6857c97
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+ bool range_updated:1;
+ u8 btree_id;
+ u8 level;
+ unsigned sectors_written;
+ u32 seq;
+ u64 journal_seq;
+ u64 cookie;
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+struct find_btree_nodes {
+ int ret;
+ struct mutex lock;
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d133e..c4f524b2ca9a 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -9,6 +10,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
+#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -19,6 +21,26 @@
#include <linux/prefetch.h>
+static const char * const trans_commit_flags_strs[] = {
+#define x(n, ...) #n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
+{
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+
+ prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
+
+ flags >>= BCH_WATERMARK_BITS;
+ if (flags) {
+ prt_char(out, ' ');
+ bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
+ }
+}
+
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -111,11 +133,12 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
return 0;
}
-static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
+ if (btree_node_locked_type(trans->paths + i->path, i->level) ==
+ BTREE_NODE_WRITE_LOCKED)
bch2_btree_node_unlock_write_inlined(trans,
trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
@@ -191,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
- bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ bch2_bset_insert(b, k, insert, clobber_u64s);
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)
@@ -207,14 +230,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
struct btree_trans *trans = bch2_trans_get(c);
- unsigned long old, new, v;
+ unsigned long old, new;
unsigned idx = w - b->writes;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
if (!(old & (1 << BTREE_NODE_dirty)) ||
!!(old & (1 << BTREE_NODE_write_idx)) != idx ||
@@ -224,9 +247,9 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_journal_reclaim;
new |= 1 << BTREE_NODE_need_write;
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
bch2_trans_put(trans);
@@ -315,17 +338,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
- test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
+ test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
+ trans->journal_u64s, flags, trans);
}
#define JSET_ENTRY_LOG_U64s 4
@@ -361,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct bkey_i *new_k;
int ret;
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
bch2_trans_unlock(trans);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
@@ -397,12 +420,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
EBUG_ON(path->level);
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -434,34 +458,35 @@ static int run_one_mem_trigger(struct btree_trans *trans,
struct btree_insert_entry *i,
unsigned flags)
{
+ verify_update_old_key(trans, i);
+
+ if (unlikely(flags & BTREE_TRIGGER_norun))
+ return 0;
+
struct bkey_s_c old = { &i->old_k, i->old_v };
struct bkey_i *new = i->k;
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- int ret;
-
- verify_update_old_key(trans, i);
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (old_ops->trigger == new_ops->trigger) {
- ret = bch2_key_trigger(trans, i->btree_id, i->level,
+ if (old_ops->trigger == new_ops->trigger)
+ return bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
+ BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
+ else
+ return bch2_key_trigger_new(trans, i->btree_id, i->level,
bkey_i_to_s(new), flags) ?:
- bch2_key_trigger_old(trans, i->btree_id, i->level,
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
old, flags);
- }
-
- return ret;
}
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
{
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_norun) ||
+ !btree_node_type_has_trans_triggers(i->bkey_type))
+ return 0;
+
/*
* Transactional triggers create new btree_insert_entries, so we can't
* pass them a pointer to a btree_insert_entry, that memory is going to
@@ -471,13 +496,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
-
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- return 0;
+ unsigned flags = i->flags|BTREE_TRIGGER_transactional;
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
@@ -485,12 +504,12 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
- } else if (overwrite && !i->overwrite_trigger_run) {
+ BTREE_TRIGGER_insert|
+ BTREE_TRIGGER_overwrite|flags) ?: 1;
+ } else if (!i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!overwrite && !i->insert_trigger_run) {
+ } else if (!i->insert_trigger_run) {
i->insert_trigger_run = true;
return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
@@ -499,43 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+ unsigned *btree_id_updates_start)
{
- struct btree_insert_entry *i;
bool trans_trigger_run;
- int ret, overwrite;
- for (overwrite = 1; overwrite >= 0; --overwrite) {
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
+ for (unsigned i = *btree_id_updates_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
+ i++) {
+ if (trans->updates[i].btree_id < btree_id) {
+ *btree_id_updates_start = i;
+ continue;
+ }
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- if (i->btree_id != btree_id)
- continue;
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
- ret = run_one_trans_trigger(trans, i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
- }
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
+ i->btree_id == btree_id &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
return 0;
}
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *btree_id_start = trans->updates;
- unsigned btree_id = 0;
+ unsigned btree_id = 0, btree_id_updates_start = 0;
int ret = 0;
/*
@@ -549,30 +570,20 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
if (ret)
return ret;
}
- trans_for_each_update(trans, i) {
- if (i->btree_id > BTREE_ID_alloc)
- break;
- if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
- if (ret)
- return ret;
- break;
- }
- }
+ btree_id_updates_start = 0;
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
+ if (ret)
+ return ret;
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
return 0;
@@ -580,20 +591,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- trans_for_each_update(trans, i) {
- /*
- * XXX: synchronization of cached update triggers with gc
- * XXX: synchronization of interior node updates with gc
- */
- BUG_ON(i->cached || i->level);
-
- if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
- gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ trans_for_each_update(trans, i)
+ if (btree_node_type_has_triggers(i->bkey_type) &&
+ gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
if (ret)
return ret;
}
- }
return 0;
}
@@ -606,11 +610,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
- int ret;
+ int ret = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
/*
@@ -662,30 +668,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
+ i->k->k.bversion.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys)
trans_for_each_update(trans, i)
- i->k->k.version = MAX_VERSION;
+ i->k->k.bversion = MAX_VERSION;
}
- if (trans->fs_usage_deltas &&
- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
- return -BCH_ERR_btree_insert_need_mark_replicas;
-
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
-
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
if (ret)
- goto revert_fs_usage;
+ return ret;
h = h->next;
}
+ struct jset_entry *entry = trans->journal_entries;
+
+ percpu_down_read(&c->mark_lock);
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry->start->k.type == KEY_TYPE_accounting) {
+ ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
+ if (ret)
+ goto revert_fs_usage;
+ }
+ percpu_up_read(&c->mark_lock);
+
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
+ if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
if (ret)
goto fatal_err;
}
@@ -696,6 +712,37 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
+ struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, validate_context);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
+ goto fatal_err;
+ }
+ }
+
+ trans_for_each_update(trans, i) {
+ validate_context.level = i->level;
+ validate_context.btree = i->btree_id;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
+ if (unlikely(ret)){
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -704,7 +751,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
verify_update_old_key(trans, i);
@@ -739,75 +786,41 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
- if (!i->cached) {
+ if (!i->cached)
bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- } else if (!i->key_cache_already_flushed)
+ else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
- else {
+ else
bch2_btree_key_cache_drop(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- }
}
return 0;
fatal_err:
- bch2_fatal_error(c);
+ bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
+ percpu_down_read(&c->mark_lock);
revert_fs_usage:
- if (trans->fs_usage_deltas)
- bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+ for (struct jset_entry *entry2 = trans->journal_entries;
+ entry2 != entry;
+ entry2 = vstruct_next(entry2))
+ if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry2->start->k.type == KEY_TYPE_accounting)
+ bch2_accounting_trans_commit_revert(trans,
+ bkey_i_to_accounting(entry2->start), flags);
+ percpu_up_read(&c->mark_lock);
return ret;
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
+ /*
+ * Accounting keys aren't deduped in the journal: we have to compare
+ * each individual update against what's in the btree to see if it has
+ * been applied yet, and accounting updates also don't overwrite,
+ * they're deltas that accumulate.
+ */
trans_for_each_update(trans, i)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
- enum bkey_invalid_flags flags,
- struct btree_insert_entry *i,
- struct printbuf *err)
-{
- struct bch_fs *c = trans->c;
-
- printbuf_reset(err);
- prt_printf(err, "invalid bkey on insert from %s -> %ps",
- trans->fn, (void *) i->ip_allocated);
- prt_newline(err);
- printbuf_indent_add(err, 2);
-
- bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
- prt_newline(err);
-
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
- bch2_print_string_as_lines(KERN_ERR, err->buf);
-
- bch2_inconsistent_error(c);
- bch2_dump_trans_updates(trans);
-
- return -EINVAL;
-}
-
-static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
- struct jset_entry *i)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- bch2_journal_entry_to_text(&buf, c, i);
- prt_newline(&buf);
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
-
- bch2_inconsistent_error(c);
- bch2_dump_trans_updates(trans);
-
- return -EINVAL;
+ if (i->k->k.type != KEY_TYPE_accounting)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
static int bch2_trans_commit_journal_pin_flush(struct journal *j,
@@ -826,7 +839,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
struct bch_fs *c = trans->c;
int ret = 0, u64s_delta = 0;
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
if (i->cached)
continue;
@@ -854,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -874,7 +888,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
static int journal_reclaim_wait_done(struct bch_fs *c)
{
int ret = bch2_journal_error(&c->journal) ?:
- !bch2_btree_key_cache_must_wait(c);
+ bch2_btree_key_cache_wait_done(c);
if (!ret)
journal_reclaim_kick(&c->journal);
@@ -887,6 +901,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -897,7 +912,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
- bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+ bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_journal_res_get_blocked:
/*
@@ -905,7 +920,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
@@ -919,9 +934,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
bch2_trans_unlock(trans);
trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
+
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
+
if (ret < 0)
break;
@@ -941,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
return ret;
}
-static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
- test_bit(BCH_FS_started, &c->flags))
- return -BCH_ERR_erofs_trans_commit;
-
- ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
- if (ret)
- return ret;
-
- bch2_write_ref_get(c, BCH_WRITE_REF_trans);
- return 0;
-}
-
/*
* This is for updates done in the early part of fsck - btree_gc - before we've
* gone RW. we only add the new key to the list of keys for journal replay to
@@ -968,15 +969,26 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- int ret = 0;
+
+ BUG_ON(current != c->recovery_task);
trans_for_each_update(trans, i) {
- ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
- break;
+ return ret;
}
- return ret;
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (i->type == BCH_JSET_ENTRY_btree_keys ||
+ i->type == BCH_JSET_ENTRY_write_buffer_keys) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
@@ -985,60 +997,27 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret = 0;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret))
+ goto out_reset;
+
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
- trans_for_each_update(trans, i) {
- struct printbuf buf = PRINTBUF;
- enum bkey_invalid_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
-
- if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags, &buf)))
- ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
- btree_insert_entry_checks(trans, i);
- printbuf_exit(&buf);
-
- if (ret)
- return ret;
- }
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i)) {
- enum bkey_invalid_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
-
- if (unlikely(bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags)))
- ret = bch2_trans_commit_journal_entry_invalid(trans, i);
-
- if (ret)
- return ret;
- }
-
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- goto out_reset;
- }
-
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
- ret = bch2_trans_commit_get_rw_cold(trans, flags);
- if (ret)
- goto out_reset;
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ else
+ ret = -BCH_ERR_erofs_trans_commit;
+ goto out_reset;
}
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
@@ -1062,7 +1041,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
/* we're going to journal the key being updated: */
@@ -1083,9 +1062,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
}
retry:
errored_at = NULL;
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4a5a64499eb7..a09cbe9cd94f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
+#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
@@ -137,6 +138,31 @@ struct btree {
struct list_head list;
};
+#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+ x(read_in_flight) \
+ x(write_in_flight) \
+ x(noevict) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(access_bit)
+
+enum bch_btree_cache_not_freed_reasons {
+#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
+};
+
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
+};
+
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -154,16 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
- /* Number of elements in live + freeable lists */
- unsigned used;
- unsigned reserve;
- atomic_t dirty;
- struct shrinker *shrink;
+ size_t nr_freeable;
+ size_t nr_reserve;
+ size_t nr_by_btree[BTREE_ID_NR];
+ atomic_long_t nr_dirty;
+
+ /* shrinker stats */
+ size_t nr_freed;
+ u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@@ -173,6 +202,11 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
+
+ struct bbpos pinned_nodes_start;
+ struct bbpos pinned_nodes_end;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@@ -181,36 +215,89 @@ struct btree_node_iter {
} data[MAX_BSETS];
};
+#define BTREE_ITER_FLAGS() \
+ x(slots) \
+ x(intent) \
+ x(prefetch) \
+ x(is_extents) \
+ x(not_extents) \
+ x(cached) \
+ x(with_key_cache) \
+ x(with_updates) \
+ x(with_journal) \
+ x(snapshot_field) \
+ x(all_snapshots) \
+ x(filter_snapshots) \
+ x(nopreserve) \
+ x(cached_nofill) \
+ x(key_cache_fill) \
+
+#define STR_HASH_FLAGS() \
+ x(must_create) \
+ x(must_replace)
+
+#define BTREE_UPDATE_FLAGS() \
+ x(internal_snapshot_node) \
+ x(nojournal) \
+ x(key_cache_reclaim)
+
+
/*
- * Iterate over all possible positions, synthesizing deleted keys for holes:
- */
-static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-/*
- * Indicates that intent locks should be taken on leaf nodes, because we expect
- * to be doing updates:
- */
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
-/*
- * Causes the btree iterator code to prefetch additional btree nodes from disk:
- */
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
+ * BTREE_TRIGGER_norun - don't run triggers at all
+ *
+ * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
+ * a transaction commit: triggers may generate new updates
+ *
+ * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
+ * commit: we have our journal reservation, we're holding btree node write
+ * locks, and we know the transaction is going to commit (returning an error
+ * here is a fatal error, causing us to go emergency read-only)
+ *
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+ *
+ * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+ * trigger
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
-#define __BTREE_ITER_FLAGS_END 15
+#define BTREE_TRIGGER_FLAGS() \
+ x(norun) \
+ x(transactional) \
+ x(atomic) \
+ x(check_repair) \
+ x(gc) \
+ x(insert) \
+ x(overwrite) \
+ x(is_root) \
+ x(bucket_invalidate)
+
+enum {
+#define x(n) BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+ STR_HASH_FLAGS()
+ BTREE_UPDATE_FLAGS()
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
+
+/* iter flags must fit in a u16: */
+//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
+
+enum btree_iter_update_trigger_flags {
+#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+#undef x
+#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ STR_HASH_FLAGS()
+#undef x
+#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_UPDATE_FLAGS()
+#undef x
+#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -301,7 +388,7 @@ struct btree_iter {
*/
struct bkey k;
- /* BTREE_ITER_WITH_JOURNAL: */
+ /* BTREE_ITER_with_journal: */
size_t journal_idx;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
@@ -316,17 +403,15 @@ struct bkey_cached {
unsigned long flags;
u16 u64s;
- bool valid;
- u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
- struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
+ struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
@@ -358,7 +443,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iter()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
@@ -393,11 +492,14 @@ struct btree_trans {
btree_path_idx_t nr_sorted;
btree_path_idx_t nr_paths;
btree_path_idx_t nr_paths_max;
+ btree_path_idx_t nr_updates;
u8 fn_idx;
- u8 nr_updates;
u8 lock_must_abort;
bool lock_may_not_fail:1;
bool srcu_held:1;
+ bool locked:1;
+ bool pf_memalloc_nofs:1;
+ bool write_locked:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -405,13 +507,19 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
- bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ u32 restart_count_this_trans;
+#endif
u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ bch_stacktrace last_restarted_trace;
+#endif
+ unsigned long last_unlock_ip;
unsigned long srcu_lock_time;
const char *fn;
@@ -435,8 +543,10 @@ struct btree_trans {
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
- struct replicas_delta_list *fs_usage_deltas;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
/* Entries before this are zeroed out on every bch2_trans_get() call */
struct list_head list;
@@ -494,7 +604,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
- x(never_write)
+ x(never_write) \
+ x(pinned)
enum btree_flags {
/* First bits for btree node write type */
@@ -654,6 +765,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_subvolumes)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
@@ -666,58 +778,79 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
{
- return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
+static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
+}
+
+static inline bool btree_node_type_has_triggers(enum btree_node_type type)
{
- const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
+}
+
+static inline bool btree_id_is_extents(enum btree_id btree)
+{
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << type) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_id_is_extents(enum btree_id btree)
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return btree_node_type_is_extents(__btree_node_type(0, btree));
+ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id btree)
+{
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshots(enum btree_id id)
+static inline bool btree_type_has_snapshot_field(enum btree_id btree)
{
- const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshot_field(enum btree_id id)
+static inline bool btree_type_has_ptrs(enum btree_id btree)
{
- const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_ptrs(enum btree_id id)
+static inline bool btree_type_uses_write_buffer(enum btree_id btree)
{
- const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(btree) & mask;
}
struct btree_root {
@@ -727,7 +860,7 @@ struct btree_root {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
- s8 error;
+ s16 error;
};
enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c3ff365acce9..13d794f201a5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -25,19 +25,22 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
static int __must_check
bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_update_flags,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
unsigned long ip);
static noinline int extent_front_merge(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct bkey_i **insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *update;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
update = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(update);
if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
if (ret < 0)
@@ -98,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
pos.snapshot++;
for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_nopreserve, k, ret) {
if (!bkey_eq(k.k->p, pos))
break;
@@ -132,21 +138,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
darray_init(&s);
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);;
+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
continue;
new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bkey_err(new_k);
if (ret)
break;
@@ -162,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
update->k.type = KEY_TYPE_whiteout;
ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
bch2_trans_iter_exit(trans, &new_iter);
@@ -179,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
struct bkey_s_c old,
struct bkey_s_c new)
{
@@ -212,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -229,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -254,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -267,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
bch2_cut_front(new.k->p, update);
ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
flags, _RET_IP_);
if (ret)
return ret;
@@ -279,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
static int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -287,10 +293,10 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
int ret = 0;
bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates|
+ BTREE_ITER_not_extents);
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -317,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto out;
next:
bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -340,7 +346,7 @@ err:
static noinline int flush_new_cached_update(struct btree_trans *trans,
struct btree_insert_entry *i,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bkey k;
@@ -348,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
btree_path_idx_t path_idx =
bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
+ BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
@@ -366,9 +372,9 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
goto out;
i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_NORUN;
+ i->flags |= BTREE_TRIGGER_norun;
- btree_path_set_should_be_locked(btree_path);
+ btree_path_set_should_be_locked(trans, btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
bch2_path_put(trans, path_idx, true);
@@ -377,7 +383,7 @@ out:
static int __must_check
bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_update_flags flags,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
@@ -416,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
break;
}
- if (!cmp && i < trans->updates + trans->nr_updates) {
+ bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
+
+ if (overwrite) {
EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
@@ -443,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
}
}
- __btree_path_get(trans->paths + i->path, true);
+ __btree_path_get(trans, trans->paths + i->path, true);
+
+ trace_update_by_path(trans, path, i, overwrite);
/*
* If a key is present in the key cache, it must also exist in the
@@ -452,7 +462,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
- if (path->cached && bkey_deleted(&i->old_k))
+ if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;
@@ -473,15 +483,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
if (!iter->key_cache_path)
iter->key_cache_path =
bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT|
- BTREE_ITER_CACHED, _THIS_IP_);
+ BTREE_ITER_intent|
+ BTREE_ITER_cached, _THIS_IP_);
iter->key_cache_path =
bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
if (unlikely(ret))
return ret;
@@ -492,24 +502,24 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
}
return 0;
}
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
return bch2_trans_update_extent(trans, iter, k, flags);
if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ !(flags & BTREE_UPDATE_key_cache_reclaim) &&
+ (iter->flags & BTREE_ITER_filter_snapshots)) {
ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
if (unlikely(ret < 0))
return ret;
@@ -522,7 +532,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
* Ensure that updates to cached btrees go to the key cache:
*/
struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
!path->cached &&
!path->level &&
btree_id_cached(trans->c, path->btree_id)) {
@@ -578,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
- k = bch2_btree_iter_prev(iter);
- ret = bkey_err(k);
+ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ int ret = bkey_err(k);
if (ret)
goto err;
@@ -615,15 +622,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
int bch2_btree_insert_nonextent(struct btree_trans *trans,
enum btree_id btree, struct bkey_i *k,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
int ret;
bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_CACHED|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
@@ -631,16 +638,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
}
int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
- int ret;
-
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
+ BTREE_ITER_intent|flags);
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -653,37 +657,31 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
* @disk_res: must be non-NULL whenever inserting or potentially
* splitting data extents
* @flags: transaction commit flags
+ * @iter_flags: btree iter update trigger flags
*
* Returns: 0 on success, error code on failure
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
- struct disk_reservation *disk_res, int flags)
+ struct disk_reservation *disk_res, int flags,
+ enum btree_iter_update_trigger_flags iter_flags)
{
- return bch2_trans_do(c, disk_res, NULL, flags,
- bch2_btree_insert_trans(trans, id, k, 0));
+ return bch2_trans_commit_do(c, disk_res, NULL, flags,
+ bch2_btree_insert_trans(trans, id, k, iter_flags));
}
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
- unsigned len, unsigned update_flags)
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
bkey_init(&k->k);
k->k.p = iter->pos;
- bch2_key_resize(&k->k, len);
return bch2_trans_update(trans, iter, k, update_flags);
}
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
int bch2_btree_delete(struct btree_trans *trans,
enum btree_id btree, struct bpos pos,
unsigned update_flags)
@@ -692,8 +690,8 @@ int bch2_btree_delete(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -711,8 +709,8 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
- while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
+ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@@ -739,7 +737,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
*/
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -785,9 +783,37 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
return ret;
}
+int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = iter->pos;
+ if (iter->flags & BTREE_ITER_is_extents)
+ bch2_key_resize(&k->k, 1);
+
+ return bch2_trans_update(trans, iter, k, 0);
+}
+
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
+
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_bit_mod_iter(trans, &iter, set);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
struct bkey_i k;
bkey_init(&k.k);
@@ -797,10 +823,17 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
return bch2_trans_update_buffered(trans, btree, &k);
}
-static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
{
+ unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
+ prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
+
+ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ return ret;
+
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- int ret = PTR_ERR_OR_ZERO(e);
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
@@ -825,7 +858,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
if (ret)
goto err;
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ if (!test_bit(JOURNAL_running, &c->journal.flags)) {
ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
if (ret)
goto err;
@@ -835,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
memcpy(l->d, buf.buf, buf.pos);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
- ret = bch2_trans_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_lazy_rw|commit_flags,
- __bch2_trans_log_msg(trans, &buf, u64s));
+ ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
+ bch2_trans_log_msg(trans, &buf));
}
err:
printbuf_exit(&buf);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b9382b7b288b..47d8690f01bf 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -24,11 +24,11 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
#define BCH_TRANS_COMMIT_FLAGS() \
x(no_enospc, "don't check for enospc") \
x(no_check_rw, "don't attempt to take a ref on c->writes") \
- x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
x(no_journal_res, "don't take a journal reservation, instead " \
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+ x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
@@ -44,30 +44,33 @@ enum bch_trans_commit_flags {
#undef x
};
-int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
- unsigned, unsigned);
+void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
+
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_update_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *, int flags);
+ enum btree_iter_update_trigger_flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
+ disk_reservation *, int flags, enum
+ btree_iter_update_trigger_flags iter_flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
- return bch2_btree_bit_mod(trans, btree, pos, false);
+ return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
@@ -93,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_update_flags,
+ enum btree_iter_update_trigger_flags,
struct bkey_s_c, struct bkey_s_c);
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
@@ -123,11 +126,31 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+int bch2_btree_write_buffer_insert_err(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
+
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
{
- if (unlikely(trans->journal_replay_not_finished))
+ if (unlikely(!btree_type_uses_write_buffer(btree))) {
+ int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
+ dump_stack();
+ return ret;
+ }
+ /*
+ * Most updates skip the btree write buffer until journal replay is
+ * finished because synchronization with journal replay relies on having
+ * a btree node locked - if we're overwriting a key in the journal that
+ * journal replay hasn't yet replayed, we have to mark it as
+ * overwritten.
+ *
+ * But accounting updates don't overwrite, they're deltas, and they have
+ * to be flushed to the btree strictly in order for journal replay to be
+ * able to tell which updates need to be applied:
+ */
+ if (k->k.type != KEY_TYPE_accounting &&
+ unlikely(trans->journal_replay_not_finished))
return bch2_btree_insert_clone_trans(trans, btree, k);
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
@@ -144,6 +167,7 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
+int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
@@ -175,15 +199,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
@@ -200,14 +216,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
trans->journal_entries_u64s = 0;
trans->hooks = NULL;
trans->extra_disk_res = 0;
-
- if (trans->fs_usage_deltas) {
- trans->fs_usage_deltas->used = 0;
- memset((void *) trans->fs_usage_deltas +
- offsetof(struct replicas_delta_list, memset_start), 0,
- (void *) &trans->fs_usage_deltas->memset_end -
- (void *) &trans->fs_usage_deltas->memset_start);
- }
}
static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
@@ -219,7 +227,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
if (type && k.k->type != type)
return ERR_PTR(-ENOENT);
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ /* extra padding for varint_decode_fast... */
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
if (!IS_ERR(mut)) {
bkey_reassemble(mut, k);
@@ -242,7 +251,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra
KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k, unsigned flags,
+ struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags,
unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
@@ -259,8 +269,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str
return mut;
}
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k, unsigned flags)
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
}
@@ -272,10 +283,11 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc
static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
{
struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type);
+ btree_id, pos, flags|BTREE_ITER_intent, type);
struct bkey_i *ret = IS_ERR(k.k)
? ERR_CAST(k.k)
: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
@@ -287,7 +299,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -295,10 +307,11 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran
static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+ btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
int ret;
if (IS_ERR(mut))
@@ -316,7 +329,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned min_bytes)
{
return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
}
@@ -324,7 +338,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans
static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -335,7 +349,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
- unsigned flags, unsigned type, unsigned val_size)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned val_size)
{
struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
int ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4530b14ff2c3..e4e7c804625e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -15,86 +16,138 @@
#include "clock.h"
#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
+static const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BTREE_UPDATE_MODES()
+#undef x
+ NULL
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *,
- struct keylist *, unsigned);
+ btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_INTENT, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
-/* Debug code: */
-
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos next_node = b->data->min_key;
- struct btree_node_iter iter;
+ struct bch_fs *c = trans->c;
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_btree_ptr_v2 bp;
- struct bkey unpacked;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->c.level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+
+ if (b == btree_node_root(c, b)) {
+ if (!bpos_eq(b->data->min_key, POS_MIN)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ log_fsck_err(trans, btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf);
+ goto topology_repair;
+ }
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
+ if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ log_fsck_err(trans, btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf);
+ goto topology_repair;
+ }
+ }
- bch2_btree_node_iter_init_from_start(&iter, b);
+ if (!b->c.level)
+ goto out;
- while (1) {
- k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
- break;
- bp = bkey_s_c_to_btree_ptr_v2(k);
+ goto out;
- if (!bpos_eq(next_node, bp.v->min_key)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, next_node);
- bch2_bpos_to_text(&buf2, bp.v->min_key);
- panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
- }
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- if (bch2_btree_node_iter_end(&iter)) {
- if (!bpos_eq(k.k->p, b->key.k.p)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, b->key.k.p);
- bch2_bpos_to_text(&buf2, k.k->p);
- panic("expected end %s got %s\n", buf1.buf, buf2.buf);
- }
- break;
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
}
- next_node = bpos_successor(k.k->p);
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
}
-#endif
+
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
+ }
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ ret = bch2_topology_error(c);
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -102,7 +155,6 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
struct bkey_packed *k;
- struct bset_tree *t;
struct bkey uk;
for_each_bset(b, t)
@@ -179,10 +231,6 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b)
BUG_ON(b->will_make_reachable);
clear_btree_node_noevict(b);
-
- mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
@@ -190,19 +238,19 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+
__btree_node_free(trans, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static void bch2_btree_node_free_never_used(struct btree_update *as,
@@ -211,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
- struct btree_path *path;
- unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -226,8 +272,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
BUG_ON(p->nr >= ARRAY_SIZE(p->b));
@@ -235,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@@ -255,11 +296,17 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ if (IS_ERR(b))
+ return b;
+
+ BUG_ON(b->ob.nr);
+
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
@@ -268,10 +315,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
- goto mem_alloc;
+ goto out;
}
mutex_unlock(&c->btree_reserve_cache_lock);
-
retry:
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
@@ -284,7 +330,7 @@ retry:
c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
- return ERR_PTR(ret);
+ goto err;
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@@ -303,19 +349,16 @@ retry:
bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
-mem_alloc:
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- /* we hold cannibalize_lock: */
- BUG_ON(IS_ERR(b));
- BUG_ON(b->ob.nr);
-
+out:
bkey_copy(&b->key, &tmp.k);
b->ob = obs;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return b;
+err:
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
@@ -457,8 +500,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
__btree_node_free(trans, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ bch2_btree_node_to_freelist(c, b);
}
}
}
@@ -550,6 +592,26 @@ static void btree_update_add_key(struct btree_update *as,
bch2_keylist_push(keys);
}
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false;
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->sb_lock);
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@@ -570,7 +632,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -579,7 +641,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -607,6 +669,9 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
+
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -616,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as)
b = as->old_nodes[i];
+ bch2_trans_begin(trans);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
seq = b->data ? b->data->keys.seq : 0;
six_unlock_read(&b->c.lock);
+ bch2_trans_unlock_long(trans);
if (seq == as->old_nodes_seq[i])
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
@@ -639,7 +706,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -647,14 +714,27 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s(): error %s", __func__, bch2_err_str(ret));
+ "%s", bch2_err_str(ret));
err:
- if (as->b) {
+ /*
+ * Ensure transaction is unlocked before using btree_node_lock_nopath()
+ * (the use of which is always suspect, we need to work on removing this
+ * in the future)
+ *
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
+ * rarely end up with a locked path besides the one we have here:
+ */
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
- b = as->b;
- btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
/*
* @b is the node we did the final insert into:
*
@@ -667,17 +747,9 @@ err:
* we're in journal error state:
*/
- /*
- * Ensure transaction is unlocked before using
- * btree_node_lock_nopath() (the use of which is always suspect,
- * we need to work on removing this in the future)
- *
- * It should be, but get_unlocked_mut_path() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so
- * we may rarely end up with a locked path besides the one we
- * have here:
- */
- bch2_trans_unlock(trans);
+ btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
@@ -722,7 +794,7 @@ err:
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
six_unlock_write(&b->c.lock);
- btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_write_if_need(trans, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
bch2_path_put(trans, path_idx, true);
}
@@ -743,7 +815,7 @@ err:
b = as->new_nodes[i];
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -795,15 +867,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -825,7 +899,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -836,7 +910,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -850,7 +924,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1028,7 +1102,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1045,7 +1119,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1053,7 +1127,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1068,29 +1142,29 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_space_low, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1,
- watermark|JOURNAL_RES_GET_CHECK));
+ ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1098,11 +1172,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1116,12 +1190,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->flags = flags;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1163,7 +1240,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -1174,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
bch2_trans_unlock(trans);
- closure_sync(&cl);
+ bch2_wait_on_allocator(c, &cl);
} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
}
@@ -1193,7 +1270,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
err:
bch2_btree_update_free(as, trans);
if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS))
+ !bch2_err_matches(ret, EROFS) &&
+ ret != -BCH_ERR_journal_reclaim_would_deadlock)
bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1208,33 +1286,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
- BUG_ON(btree_node_root(c, b) &&
- (b->c.level < btree_node_root(c, b)->c.level ||
- !btree_node_dying(btree_node_root(c, b))));
-
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
bch2_btree_set_root_inmem(c, b);
@@ -1248,6 +1328,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
@@ -1262,26 +1343,23 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
struct bch_fs *c = as->c;
struct bkey_packed *k;
struct printbuf buf = PRINTBUF;
- unsigned long old, new, v;
+ unsigned long old, new;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(insert));
+ !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
- if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
- btree_node_type(b), WRITE, &buf) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "inserting invalid bkey\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_printf(&buf, "\n ");
- bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
- btree_node_type(b), WRITE, &buf);
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_commit,
+ };
+ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
+ bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
dump_stack();
}
@@ -1301,25 +1379,25 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_interior;
new |= 1 << BTREE_NODE_need_write;
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
printbuf_exit(&buf);
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1330,15 +1408,35 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
(bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
;
- while (!bch2_keylist_empty(keys)) {
- insert = bch2_keylist_front(keys);
+ for (;
+ insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
+ insert = bkey_next(insert))
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
- if (bpos_gt(insert->k.p, b->key.k.p))
- break;
+ if (bch2_btree_node_check_topology(trans, b)) {
+ struct printbuf buf = PRINTBUF;
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
- bch2_keylist_pop_front(keys);
+ for (struct bkey_i *k = keys->keys;
+ k != insert;
+ k = bkey_next(k)) {
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ }
+
+ panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
}
+
+ memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
+ keys->top_p -= insert->_data - keys->keys_p;
+}
+
+static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
+{
+ if (insert_keys)
+ for_each_keylist_key(insert_keys, k)
+ if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
+ return true;
+ return false;
}
/*
@@ -1348,7 +1446,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
static void __btree_split_node(struct btree_update *as,
struct btree_trans *trans,
struct btree *b,
- struct btree *n[2])
+ struct btree *n[2],
+ struct keylist *insert_keys)
{
struct bkey_packed *k;
struct bpos n1_pos = POS_MIN;
@@ -1378,9 +1477,17 @@ static void __btree_split_node(struct btree_update *as,
if (bkey_deleted(k))
continue;
+ uk = bkey_unpack_key(b, k);
+
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
+ key_deleted_in_insert(insert_keys, uk.p)))
+ n1_u64s += k->u64s;
+
i = u64s >= n1_u64s;
u64s += k->u64s;
- uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
@@ -1439,8 +1546,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
- if (b->c.level)
- btree_node_interior_verify(as->c, n[i]);
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@@ -1469,15 +1575,13 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-
- btree_node_interior_verify(as->c, b);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
}
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1486,10 +1590,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
- bch2_btree_interior_update_will_free_node(as, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
@@ -1499,7 +1606,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
- __btree_split_node(as, trans, b, n);
+ __btree_split_node(as, trans, b, n, keys);
if (keys) {
btree_split_insert_keys(as, trans, path, n1, keys);
@@ -1515,12 +1622,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
+ path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path2, n2);
@@ -1565,7 +1672,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
@@ -1578,26 +1685,29 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
} else if (n3) {
- bch2_btree_set_root(as, trans, trans->paths + path, n3);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, trans->paths + path, n1);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
+ if (ret)
+ goto err;
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
}
if (n2) {
bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
}
bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@@ -1644,27 +1754,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1673,7 +1762,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
- * @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@@ -1683,10 +1771,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1709,9 +1798,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- btree_node_interior_verify(c, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
+
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1725,21 +1824,20 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
-
- btree_node_interior_verify(c, b);
return 0;
split:
/*
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path_idx, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1845,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
-
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@@ -1759,7 +1856,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
- ret = btree_split(as, trans, path, b, NULL, flags);
+ ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@@ -1775,6 +1872,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx)
+{
+ struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
+ struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+ BUG_ON(!btree_node_locked(path, b->c.level));
+
+ n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ path->locks_want++;
+ BUG_ON(btree_node_locked(path, n->c.level));
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path, n);
+
+ n->sib_u64s[0] = U16_MAX;
+ n->sib_u64s[1] = U16_MAX;
+
+ bch2_keylist_add(&as->parent_keys, &b->key);
+ btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+ int ret = bch2_btree_set_root(as, trans, path, n, true);
+ BUG_ON(ret);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
+ bch2_trans_node_add(trans, path, n);
+ six_unlock_intent(&n->c.lock);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
+ mutex_unlock(&c->btree_cache.lock);
+
+ bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b))
+ return bch2_btree_split_leaf(trans, path, flags);
+
+ struct btree_update *as =
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ __btree_increase_depth(as, trans, path);
+ bch2_btree_update_done(as, trans);
+ return 0;
+}
+
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@@ -1794,9 +1950,26 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
+
+ if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= BCH_WATERMARK_btree;
+ flags |= BCH_TRANS_COMMIT_journal_reclaim;
+ }
+
b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -1810,12 +1983,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
: bpos_successor(b->data->max_key);
sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- btree_path_set_should_be_locked(trans->paths + sib_path);
+ btree_path_set_should_be_locked(trans, trans->paths + sib_path);
m = trans->paths[sib_path].l[level].b;
@@ -1845,8 +2018,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
__func__, buf1.buf, buf2.buf);
printbuf_exit(&buf1);
printbuf_exit(&buf2);
- bch2_topology_error(c);
- ret = -EIO;
+ ret = bch2_topology_error(c);
goto err;
}
@@ -1882,9 +2054,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
trace_and_count(c, btree_node_merge, trans, b);
- bch2_btree_interior_update_will_free_node(as, b);
- bch2_btree_interior_update_will_free_node(as, m);
-
n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data,
@@ -1904,7 +2073,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -1916,14 +2085,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
+ bch2_btree_interior_update_will_free_node(as, b);
+ bch2_btree_interior_update_will_free_node(as, m);
+
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, trans->paths + path, b);
bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
@@ -1943,6 +2115,10 @@ err:
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
@@ -1970,15 +2146,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (ret)
goto out;
- bch2_btree_interior_update_will_free_node(as, b);
-
n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_build_aux_trees(n);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -1987,16 +2161,18 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path,
- parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
} else {
- bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
+ if (ret)
+ goto err;
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
@@ -2021,42 +2197,50 @@ struct async_btree_rewrite {
struct list_head list;
enum btree_id btree_id;
unsigned level;
- struct bpos pos;
- __le64 seq;
+ struct bkey_buf key;
};
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
struct async_btree_rewrite *a)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct btree *b;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ bch2_trans_node_iter_init(trans, &iter,
+ a->btree_id, a->key.k->k.p,
BTREE_MAX_DEPTH, a->level, 0);
- b = bch2_btree_iter_peek_node(&iter);
- ret = PTR_ERR_OR_ZERO(b);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
- if (!b || b->data->keys.seq != a->seq) {
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ ret = found
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ : -ENOENT;
+
+#if 0
+ /* Tracepoint... */
+ if (!ret || ret == -ENOENT) {
+ struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null");
- bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
- __func__, a->seq, buf.buf);
+ if (!ret) {
+ prt_printf(&buf, "rewrite node:\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ } else {
+ prt_printf(&buf, "node to rewrite not found:\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ prt_printf(&buf, "\n got: ");
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null)");
+ }
+ bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
- goto out;
}
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+#endif
out:
bch2_trans_iter_exit(trans, &iter);
-
return ret;
}
@@ -2065,85 +2249,99 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
- async_btree_node_rewrite_trans(trans, a));
- bch_err_fn(c, ret);
+ int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+ if (ret != -ENOENT)
+ bch_err_fn_ratelimited(c, ret);
+
+ spin_lock(&c->btree_node_rewrites_lock);
+ list_del(&a->list);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ closure_wake_up(&c->btree_node_rewrites_wait);
+
+ bch2_bkey_buf_exit(&a->key, c);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
- struct async_btree_rewrite *a;
- int ret;
-
- a = kmalloc(sizeof(*a), GFP_NOFS);
- if (!a) {
- bch_err(c, "%s: error allocating memory", __func__);
+ struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
+ if (!a)
return;
- }
a->c = c;
a->btree_id = b->c.btree_id;
a->level = b->c.level;
- a->pos = b->key.k.p;
- a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
- mutex_lock(&c->pending_node_rewrites_lock);
- list_add(&a->list, &c->pending_node_rewrites);
- mutex_unlock(&c->pending_node_rewrites_lock);
- return;
- }
+ bch2_bkey_buf_init(&a->key);
+ bch2_bkey_buf_copy(&a->key, c, &b->key);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- if (test_bit(BCH_FS_started, &c->flags)) {
- bch_err(c, "%s: error getting c->writes ref", __func__);
- kfree(a);
- return;
- }
+ bool now = false, pending = false;
- ret = bch2_fs_read_write_early(c);
- bch_err_msg(c, ret, "going read-write");
- if (ret) {
- kfree(a);
- return;
- }
+ spin_lock(&c->btree_node_rewrites_lock);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ list_add(&a->list, &c->btree_node_rewrites);
+ now = true;
+ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ list_add(&a->list, &c->btree_node_rewrites_pending);
+ pending = true;
+ }
+ spin_unlock(&c->btree_node_rewrites_lock);
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ if (now) {
+ queue_work(c->btree_node_rewrite_worker, &a->work);
+ } else if (pending) {
+ /* bch2_do_pending_node_rewrites will execute */
+ } else {
+ bch2_bkey_buf_exit(&a->key, c);
+ kfree(a);
}
+}
- queue_work(c->btree_interior_update_worker, &a->work);
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
+{
+ closure_wait_event(&c->btree_node_rewrites_wait,
+ list_empty(&c->btree_node_rewrites));
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
- struct async_btree_rewrite *a, *n;
-
- mutex_lock(&c->pending_node_rewrites_lock);
- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
- list_del(&a->list);
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ if (a)
+ list_add(&a->list, &c->btree_node_rewrites);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ if (!a)
+ break;
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
- mutex_unlock(&c->pending_node_rewrites_lock);
}
void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
- struct async_btree_rewrite *a, *n;
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ spin_unlock(&c->btree_node_rewrites_lock);
- mutex_lock(&c->pending_node_rewrites_lock);
- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
- list_del(&a->list);
+ if (!a)
+ break;
+ bch2_bkey_buf_exit(&a->key, c);
kfree(a);
}
- mutex_unlock(&c->pending_node_rewrites_lock);
}
static int __bch2_btree_node_update_key(struct btree_trans *trans,
@@ -2161,10 +2359,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (!skip_triggers) {
ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_TRANSACTIONAL) ?:
+ BTREE_TRIGGER_transactional) ?:
bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s(new_key),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -2181,7 +2379,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bch2_trans_copy_iter(&iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_INTENT,
+ iter2.flags & BTREE_ITER_intent,
_THIS_IP_);
struct btree_path *path2 = btree_iter_path(trans, &iter2);
@@ -2193,7 +2391,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
} else {
@@ -2220,7 +2418,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
@@ -2272,6 +2471,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
}
new_hash = bch2_btree_node_mem_alloc(trans, false);
+ ret = PTR_ERR_OR_ZERO(new_hash);
+ if (ret)
+ goto err;
}
path->intent_ref++;
@@ -2279,14 +2481,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
commit_flags, skip_triggers);
--path->intent_ref;
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- list_move(&new_hash->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&new_hash->c.lock);
- six_unlock_intent(&new_hash->c.lock);
- }
+ if (new_hash)
+ bch2_btree_node_to_freelist(c, new_hash);
+err:
closure_sync(&cl);
bch2_btree_cache_cannibalize_unlock(trans);
return ret;
@@ -2301,7 +2498,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;
@@ -2315,7 +2512,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
BUG_ON(!btree_node_hashed(b));
- struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
@@ -2339,7 +2535,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2356,9 +2552,13 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(trans);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
+
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
- b->c.level = 0;
+ b->c.level = level;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
@@ -2385,9 +2585,25 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0;
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+ bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+ prt_printf(out, "%ps: ", (void *) as->ip_started);
+ bch2_trans_commit_flags_to_text(out, as->flags);
+
+ prt_str(out, " ");
+ bch2_btree_id_to_text(out, as->btree_id);
+ prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ as->update_level_start,
+ as->update_level_end,
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2396,12 +2612,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%p m %u w %u r %u j %llu\n",
- as,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -2463,8 +2674,35 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
return end;
}
+static void bch2_btree_alloc_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct btree_alloc *a)
+{
+ printbuf_indent_add(out, 2);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
+ prt_newline(out);
+
+ struct open_bucket *ob;
+ unsigned i;
+ open_bucket_for_each(c, &a->ob, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
+ bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
+}
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ WARN_ON(!list_empty(&c->btree_node_rewrites));
+ WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
+
+ if (c->btree_node_rewrite_worker)
+ destroy_workqueue(c->btree_node_rewrite_worker);
if (c->btree_interior_update_worker)
destroy_workqueue(c->btree_interior_update_worker);
mempool_exit(&c->btree_interior_update_pool);
@@ -2478,17 +2716,23 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
mutex_init(&c->btree_interior_update_lock);
INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
- INIT_LIST_HEAD(&c->pending_node_rewrites);
- mutex_init(&c->pending_node_rewrites_lock);
+ INIT_LIST_HEAD(&c->btree_node_rewrites);
+ INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
+ spin_lock_init(&c->btree_node_rewrites_lock);
}
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ c->btree_node_rewrite_worker =
+ alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+ if (!c->btree_node_rewrite_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c593c925d1e3..26d646e1275c 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -32,28 +46,24 @@ struct btree_update {
struct closure cl;
struct bch_fs *c;
u64 start_time;
+ unsigned long ip_started;
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_trans_commit_flags flags;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -119,6 +129,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
@@ -132,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, level));
+ if (bch2_btree_node_merging_disabled)
+ return 0;
+
b = path->l[level].b;
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
@@ -144,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
@@ -160,7 +177,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
@@ -259,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, bset_tree_last(b)));
+ (void *) btree_bkey_last(b, t));
ssize_t remaining_space =
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
- if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ if (b->written + block_sectors(c) <= btree_sectors(c))
return bne;
} else {
if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
@@ -315,9 +334,12 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, unsigned long);
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
+void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *);
void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index ac7844861966..2c09d19dd621 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -1,22 +1,24 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "disk_accounting.h"
#include "error.h"
+#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include <linux/prefetch.h>
+#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
-
static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
return (cmp_int(l->hi, r->hi) ?:
@@ -46,6 +48,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
@@ -113,7 +123,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
@@ -123,7 +133,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
- bool *write_locked, size_t *fast)
+ bool *write_locked,
+ bool *accounting_accumulated,
+ size_t *fast)
{
struct btree_path *path;
int ret;
@@ -136,6 +148,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
if (ret)
return ret;
+ if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+ if (k.k->type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+ bkey_s_c_to_accounting(k));
+ }
+ *accounting_accumulated = true;
+
/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
@@ -182,13 +204,13 @@ btree_write_buffered_insert(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ BTREE_ITER_cached|BTREE_ITER_intent);
trans->journal_res.seq = wb->journal_seq;
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -242,16 +264,37 @@ out:
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
+int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, "\n");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EROFS;
+}
+
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_iter iter = { NULL };
- size_t skipped = 0, fast = 0, slowpath = 0;
+ size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
+ bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
+
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
@@ -285,16 +328,32 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+ if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
+ ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
+ goto err;
+ }
+
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
BUG_ON(!k->journal_seq);
+ if (!accounting_replay_done &&
+ k->k.k.type == KEY_TYPE_accounting) {
+ slowpath++;
+ continue;
+ }
+
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- skipped++;
+ if (k->k.k.type == KEY_TYPE_accounting &&
+ n->k.k.type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+ bkey_i_to_s_c_accounting(&k->k));
+
+ overwritten++;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
@@ -307,25 +366,37 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
}
}
if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
bch2_btree_iter_set_pos(&iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;
+ bool accounting_accumulated = false;
do {
if (race_fault()) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
- ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+ ret = wb_flush_one(trans, &iter, k, &write_locked,
+ &accounting_accumulated, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -357,49 +428,128 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
+ if (!accounting_replay_done &&
+ i->k.k.type == KEY_TYPE_accounting) {
+ could_not_insert++;
+ continue;
+ }
+
+ if (!could_not_insert)
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
+ BCH_TRANS_COMMIT_no_journal_res ,
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
+
+ i->journal_seq = 0;
+ }
+
+ /*
+ * If journal replay hasn't finished with accounting keys we
+ * can't flush accounting keys at all - condense them and leave
+ * them for next time.
+ *
+ * Q: Can the write buffer overflow?
+ * A Shouldn't be any actual risk. It's just new accounting
+ * updates that the write buffer can't flush, and those are only
+ * going to be generated by interior btree node updates as
+ * journal replay has to split/rewrite nodes to make room for
+ * its updates.
+ *
+ * And for those new acounting updates, updates to the same
+ * counters get accumulated as they're flushed from the journal
+ * to the write buffer - see the patch for eytzingcer tree
+ * accumulated. So we could only overflow if the number of
+ * distinct counters touched somehow was very large.
+ */
+ if (could_not_insert) {
+ struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+ darray_for_each(wb->flushing.keys, i)
+ if (i->journal_seq)
+ *dst++ = *i;
+ wb->flushing.keys.nr = dst - wb->flushing.keys.data;
}
}
err:
- bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
+ if (ret || !could_not_insert) {
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
+ }
+
+ bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
return ret;
}
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+out:
+ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
+ return ret;
+}
+
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
{
struct journal *j = &c->journal;
struct journal_buf *buf;
+ bool blocked;
int ret = 0;
- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
ret = bch2_journal_keys_to_write_buffer(c, buf);
+
+ if (!blocked && !ret) {
+ spin_lock(&j->lock);
+ buf->need_flush_to_write_buffer = false;
+ spin_unlock(&j->lock);
+ }
+
mutex_unlock(&j->buf_lock);
+
+ if (blocked) {
+ bch2_journal_unblock(j);
+ break;
+ }
}
return ret;
}
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
+ bool *did_work)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
@@ -408,7 +558,9 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
do {
bch2_trans_unlock(trans);
- fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
+
+ *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
/*
* On memory allocation failure, bch2_btree_write_buffer_flush_locked()
@@ -419,8 +571,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
mutex_unlock(&wb->flushing.lock);
} while (!ret &&
(fetch_from_journal_err ||
- (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
- (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
+ (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
return ret;
}
@@ -429,17 +581,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool did_work = false;
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ bool did_work = false;
trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
+}
+
+/*
+ * The write buffer requires flushing when going RO: keys in the journal for the
+ * write buffer don't have a journal pin yet
+ */
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
+{
+ if (bch2_journal_error(&c->journal))
+ return false;
+
+ bool did_work = false;
+ bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
+ journal_cur_seq(&c->journal), &did_work));
+ return did_work;
}
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
@@ -468,6 +637,49 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
return ret;
}
+/*
+ * In check and repair code, when checking references to write buffer btrees we
+ * need to issue a flush before we have a definitive error: this issues a flush
+ * if this is a key we haven't yet checked.
+ */
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf tmp;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ if (trace_write_buffer_maybe_flush_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, referring_k);
+ trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ bch2_bkey_buf_reassemble(&tmp, c, referring_k);
+
+ if (bkey_is_btree_ptr(referring_k.k)) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+ }
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
@@ -483,6 +695,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}
+static void wb_accounting_sort(struct btree_write_buffer *wb)
+{
+ eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, NULL);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
+ struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key new = { .btree = btree };
+
+ bkey_copy(&new.k, &k->k_i);
+
+ int ret = darray_push(&wb->accounting, new);
+ if (ret)
+ return ret;
+
+ wb_accounting_sort(wb);
+ return 0;
+}
+
int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
@@ -552,11 +787,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke
bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+
+ darray_for_each(wb->accounting, i)
+ memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
}
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ unsigned live_accounting_keys = 0;
+ int ret = 0;
+
+ darray_for_each(wb->accounting, i)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
+ i->journal_seq = dst->seq;
+ live_accounting_keys++;
+ ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
+ if (ret)
+ break;
+ }
+
+ if (live_accounting_keys * 2 < wb->accounting.nr) {
+ struct btree_write_buffered_key *dst = wb->accounting.data;
+
+ darray_for_each(wb->accounting, src)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
+ *dst++ = *src;
+ wb->accounting.nr = dst - wb->accounting.data;
+ wb_accounting_sort(wb);
+ }
if (!dst->wb->keys.nr)
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
@@ -569,30 +828,7 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
mutex_unlock(&wb->inc.lock);
-}
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
- struct journal_keys_to_wb dst;
- struct jset_entry *entry;
- struct bkey_i *k;
- int ret = 0;
-
- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(entry, k) {
- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
- if (ret)
- goto out;
- }
-
- entry->type = BCH_JSET_ENTRY_btree_keys;
- }
-
- buf->need_flush_to_write_buffer = false;
-out:
- bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
}
@@ -624,6 +860,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
!bch2_journal_error(&c->journal));
+ darray_exit(&wb->accounting);
darray_exit(&wb->sorted);
darray_exit(&wb->flushing.keys);
darray_exit(&wb->inc.keys);
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index eebcd2b15249..d535cea28bde 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
#include "bkey.h"
+#include "disk_accounting.h"
static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
{
@@ -20,25 +21,58 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+struct bkey_buf;
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
+
struct journal_keys_to_wb {
struct btree_write_buffer_keys *wb;
size_t room;
u64 seq;
};
+static inline int wb_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
+ enum btree_id, struct bkey_i_accounting *);
+
+static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
+ enum btree_id btree, struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key search;
+ search.btree = btree;
+ search.k.k.p = k->k.p;
+
+ unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, &search);
+
+ if (idx >= wb->accounting.nr)
+ return bch2_accounting_key_to_wb_slowpath(c, btree, k);
+
+ struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
+ bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
+ return 0;
+}
+
int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
- EBUG_ON(!dst->seq);
-
if (unlikely(!dst->room))
return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
@@ -51,8 +85,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
return 0;
}
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ return k->k.type == KEY_TYPE_accounting
+ ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
+ : __bch2_journal_key_to_wb(c, dst, btree, k);
+}
+
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index 9b9433de9c36..e9e76e20f43b 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -52,6 +52,8 @@ struct btree_write_buffer {
struct btree_write_buffer_keys inc;
struct btree_write_buffer_keys flushing;
struct work_struct flush_work;
+
+ DARRAY(struct btree_write_buffered_key) accounting;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 54f7826ac498..345b117a4a4a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -13,11 +13,14 @@
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
#include "movinggc.h"
+#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
@@ -25,197 +28,10 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
- enum bch_data_type data_type,
- s64 sectors)
-{
- switch (data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- case BCH_DATA_parity:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- default:
- break;
- }
-}
-
-void bch2_fs_usage_initialize(struct bch_fs *c)
-{
- percpu_down_write(&c->mark_lock);
- struct bch_fs_usage *usage = c->usage_base;
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
-
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->b.reserved += usage->persistent_reserved[i];
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
- }
-
- for_each_member_device(c, ca) {
- struct bch_dev_usage dev = bch2_dev_usage_read(ca);
-
- usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
- ca->mi.bucket_size;
- }
-
- percpu_up_write(&c->mark_lock);
-}
-
-static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
- unsigned journal_seq,
- bool gc)
-{
- BUG_ON(!gc && !journal_seq);
-
- return this_cpu_ptr(gc
- ? ca->usage_gc
- : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
- struct bch_fs *c = ca->fs;
- unsigned seq, i, u64s = dev_usage_u64s();
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- memcpy(usage, ca->usage_base, u64s * sizeof(u64));
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
- } while (read_seqcount_retry(&c->usage_lock, seq));
-}
-
-u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
-{
- ssize_t offset = v - (u64 *) c->usage_base;
- unsigned i, seq;
- u64 ret;
-
- BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
- percpu_rwsem_assert_held(&c->mark_lock);
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- ret = *v;
-
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
- } while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
-}
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
-{
- struct bch_fs_usage_online *ret;
- unsigned nr_replicas = READ_ONCE(c->replicas.nr);
- unsigned seq, i;
-retry:
- ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
- if (unlikely(!ret))
- return NULL;
-
- percpu_down_read(&c->mark_lock);
-
- if (nr_replicas != c->replicas.nr) {
- nr_replicas = c->replicas.nr;
- percpu_up_read(&c->mark_lock);
- kfree(ret);
- goto retry;
- }
-
- ret->online_reserved = percpu_u64_get(c->online_reserved);
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- unsafe_memcpy(&ret->u, c->usage_base,
- __fs_usage_u64s(nr_replicas) * sizeof(u64),
- "embedded variable length struct");
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
- __fs_usage_u64s(nr_replicas));
- } while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
-}
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
-{
- unsigned u64s = fs_usage_u64s(c);
-
- BUG_ON(idx >= ARRAY_SIZE(c->usage));
-
- preempt_disable();
- write_seqcount_begin(&c->usage_lock);
-
- acc_u64s_percpu((u64 *) c->usage_base,
- (u64 __percpu *) c->usage[idx], u64s);
- percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL) {
- u64s = dev_usage_u64s();
-
- acc_u64s_percpu((u64 *) ca->usage_base,
- (u64 __percpu *) ca->usage[idx], u64s);
- percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
- }
- rcu_read_unlock();
-
- write_seqcount_end(&c->usage_lock);
- preempt_enable();
-}
-
-void bch2_fs_usage_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_fs_usage_online *fs_usage)
-{
- unsigned i;
-
- prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
-
- prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.b.hidden);
- prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.b.data);
- prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.b.cached);
- prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.b.reserved);
- prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.b.nr_inodes);
- prt_printf(out, "online reserved:\t\t%llu\n",
- fs_usage->online_reserved);
-
- for (i = 0;
- i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
- i++) {
- prt_printf(out, "%u replicas:\n", i + 1);
- prt_printf(out, "\treserved:\t\t%llu\n",
- fs_usage->u.persistent_reserved[i]);
- }
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- prt_printf(out, "\t");
- bch2_replicas_entry_to_text(out, e);
- prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
- }
+ memset(usage, 0, sizeof(*usage));
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
}
static u64 reserve_factor(u64 r)
@@ -223,16 +39,6 @@ static u64 reserve_factor(u64 r)
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
-{
- return min(fs_usage->u.b.hidden +
- fs_usage->u.b.btree +
- fs_usage->u.b.data +
- reserve_factor(fs_usage->u.b.reserved +
- fs_usage->online_reserved),
- c->capacity);
-}
-
static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
@@ -240,17 +46,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
+ percpu_u64_get(&c->usage->hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
- bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
+ data = percpu_u64_get(&c->usage->data) +
+ percpu_u64_get(&c->usage->btree);
+ reserved = percpu_u64_get(&c->usage->reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
+ ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
return ret;
}
@@ -267,324 +73,346 @@ bch2_fs_usage_read_short(struct bch_fs *c)
return ret;
}
-void bch2_dev_usage_init(struct bch_dev *ca)
+void bch2_dev_usage_to_text(struct printbuf *out,
+ struct bch_dev *ca,
+ struct bch_dev_usage *usage)
{
- ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-}
+ if (out->nr_tabstops < 5) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ }
-void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
-{
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
bch2_prt_data_type(out, i);
- prt_tab(out);
- prt_u64(out, usage->d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
+ usage->d[i].buckets,
+ usage->d[i].sectors,
+ usage->d[i].fragmented);
}
-}
-
-void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_alloc_v4 *old,
- const struct bch_alloc_v4 *new,
- u64 journal_seq, bool gc)
-{
- struct bch_fs_usage *fs_usage;
- struct bch_dev_usage *u;
-
- preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, gc);
-
- if (data_type_is_hidden(old->data_type))
- fs_usage->b.hidden -= ca->mi.bucket_size;
- if (data_type_is_hidden(new->data_type))
- fs_usage->b.hidden += ca->mi.bucket_size;
-
- u = dev_usage_ptr(ca, journal_seq, gc);
-
- u->d[old->data_type].buckets--;
- u->d[new->data_type].buckets++;
- u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
- u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
-
- u->d[BCH_DATA_cached].sectors += new->cached_sectors;
- u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
-
- u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
- u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
-
- preempt_enable();
+ prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
}
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
+static int bch2_check_fix_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ bool *do_update)
{
- return (struct bch_alloc_v4) {
- .gen = b.gen,
- .data_type = b.data_type,
- .dirty_sectors = b.dirty_sectors,
- .cached_sectors = b.cached_sectors,
- .stripe = b.stripe,
- };
-}
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
-void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket *old, struct bucket *new)
-{
- struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
- struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
+ trans, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ return 0;
+ }
- bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
-}
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ if (!g) {
+ if (fsck_err(trans, ptr_to_invalid_device,
+ "pointer to invalid bucket on device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ goto out;
+ }
-static inline int __update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry_v1 *r,
- s64 sectors)
-{
- int idx = bch2_replicas_entry_idx(c, r);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- if (idx < 0)
- return -1;
+ if (fsck_err_on(!g->gen_valid,
+ trans, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ *do_update = true;
+ }
+ }
- fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
- fs_usage->replicas[idx] += sectors;
- return 0;
-}
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ trans, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->stripe_sectors = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
+ }
-int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r, s64 sectors,
- unsigned journal_seq, bool gc)
-{
- struct bch_fs_usage *fs_usage;
- int idx, ret = 0;
- struct printbuf buf = PRINTBUF;
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ trans, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
- percpu_down_read(&c->mark_lock);
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ trans, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
- idx = bch2_replicas_entry_idx(c, r);
- if (idx < 0 &&
- fsck_err(c, ptr_to_missing_replicas_entry,
- "no replicas entry\n while marking %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- percpu_up_read(&c->mark_lock);
- ret = bch2_mark_replicas(c, r);
- percpu_down_read(&c->mark_lock);
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto out;
- if (ret)
- goto err;
- idx = bch2_replicas_entry_idx(c, r);
- }
- if (idx < 0) {
- ret = -1;
- goto err;
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ trans, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->stripe_sectors = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
}
- preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
- fs_usage->replicas[idx] += sectors;
- preempt_enable();
-err:
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive,
+ trans, ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
+ trans, ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ }
+out:
fsck_err:
- percpu_up_read(&c->mark_lock);
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
-static inline int update_cached_sectors(struct bch_fs *c,
- struct bkey_s_c k,
- unsigned dev, s64 sectors,
- unsigned journal_seq, bool gc)
-{
- struct bch_replicas_padded r;
-
- bch2_replicas_entry_cached(&r.e, dev);
-
- return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
-}
-
-static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
- gfp_t gfp)
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- struct replicas_delta_list *d = trans->fs_usage_deltas;
- unsigned new_size = d ? (d->size + more) * 2 : 128;
- unsigned alloc_size = sizeof(*d) + new_size;
-
- WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
-
- if (!d || d->used + more > d->size) {
- d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
-
- if (unlikely(!d)) {
- if (alloc_size > REPLICAS_DELTA_LIST_MAX)
- return -ENOMEM;
-
- d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
- if (!d)
- return -ENOMEM;
-
- memset(d, 0, REPLICAS_DELTA_LIST_MAX);
-
- if (trans->fs_usage_deltas)
- memcpy(d, trans->fs_usage_deltas,
- trans->fs_usage_deltas->size + sizeof(*d));
-
- new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
- kfree(trans->fs_usage_deltas);
- }
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- d->size = new_size;
- trans->fs_usage_deltas = d;
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+ goto err;
}
- return 0;
-}
-
-int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
-{
- return allocate_dropping_locks_errcode(trans,
- __replicas_deltas_realloc(trans, more, _gfp));
-}
-
-int bch2_update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry_v1 *r,
- s64 sectors)
-{
- struct replicas_delta_list *d;
- struct replicas_delta *n;
- unsigned b;
- int ret;
-
- if (!sectors)
- return 0;
-
- b = replicas_entry_bytes(r) + 8;
- ret = bch2_replicas_deltas_realloc(trans, b);
- if (ret)
- return ret;
-
- d = trans->fs_usage_deltas;
- n = (void *) d->d + d->used;
- n->delta = sectors;
- unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
- r, replicas_entry_bytes(r),
- "flexible array member embedded in strcuct with padding");
- bch2_replicas_entry_sort(&n->r);
- d->used += b;
- return 0;
-}
-
-int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
-{
- struct bch_replicas_padded r;
-
- bch2_replicas_entry_cached(&r.e, dev);
+ if (do_update) {
+ if (flags & BTREE_TRIGGER_is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
- return bch2_update_replicas_list(trans, &r.e, sectors);
-}
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
-int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
-{
- struct bucket old, new, *g;
- int ret = 0;
+ rcu_read_lock();
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
+ rcu_read_unlock();
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
- BUG_ON(data_type != BCH_DATA_sb &&
- data_type != BCH_DATA_journal);
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
+ ptr->gen = g->gen;
+ }
+ rcu_read_unlock();
+ } else {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
+ rcu_read_unlock();
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
- percpu_down_read(&c->mark_lock);
- g = gc_bucket(ca, b);
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_info(c, "updated %s", buf.buf);
- bucket_lock(g);
- old = *g;
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
- goto err;
- }
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
- goto err;
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- new = *g;
err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
- percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
-int bch2_check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 bucket_sectors)
+int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 *bucket_sectors)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
struct printbuf buf = PRINTBUF;
+ bool inserting = sectors > 0;
int ret = 0;
- if (bucket_data_type == BCH_DATA_cached)
- bucket_data_type = BCH_DATA_user;
-
- if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
- (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
- bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+ BUG_ON(!sectors);
if (gen_after(ptr->gen, b_gen)) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_too_stale,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -592,35 +420,35 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
+ }
+
+ if (b_gen != ptr->gen && ptr->cached) {
+ ret = 1;
+ goto out;
}
- if (b_gen != ptr->gen && !ptr->cached) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_stale_dirty_ptr,
+ if (b_gen != ptr->gen) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- *bucket_gen(ca, bucket_nr),
+ bucket_gen_get(ca, bucket_nr),
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
- }
-
- if (b_gen != ptr->gen) {
- ret = 1;
+ if (inserting)
+ goto err;
goto out;
}
- if (!data_type_is_empty(bucket_data_type) &&
- ptr_data_type &&
- bucket_data_type != ptr_data_type) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
+ if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -628,72 +456,38 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
- if ((u64) bucket_sectors + sectors > U32_MAX) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_bucket_sector_count_overflow,
+ if ((u64) *bucket_sectors + sectors > U32_MAX) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- bucket_sectors, sectors,
+ *bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ sectors = -*bucket_sectors;
}
+
+ *bucket_sectors += sectors;
out:
printbuf_exit(&buf);
return ret;
err:
+fsck_err:
bch2_dump_trans_updates(trans);
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_bucket_ref_update;
goto out;
}
-void bch2_trans_fs_usage_revert(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *dst;
- struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
- s64 added = 0;
- unsigned i;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- /* revert changes: */
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
- BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
- }
-
- dst->b.nr_inodes -= deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added -= deltas->persistent_reserved[i];
- dst->b.reserved -= deltas->persistent_reserved[i];
- dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
- }
-
- if (added > 0) {
- trans->disk_res->sectors += added;
- this_cpu_add(*c->online_reserved, added);
- }
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-}
-
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -702,8 +496,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
bool warn = false;
percpu_down_read(&c->mark_lock);
- preempt_disable();
- struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
struct bch_fs_usage_base *src = &trans->fs_usage_delta;
s64 added = src->btree + src->data + src->reserved;
@@ -714,13 +506,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
*/
s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
- u64 old, new, v = atomic64_read(&c->sectors_available);
+ u64 old, new;
+ old = atomic64_read(&c->sectors_available);
do {
- old = v;
new = max_t(s64, 0, old - should_not_have_added);
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, new)) != old);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, new));
added -= should_not_have_added;
warn = true;
@@ -731,13 +523,9 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
this_cpu_sub(*c->online_reserved, added);
}
- dst->hidden += src->hidden;
- dst->btree += src->btree;
- dst->data += src->data;
- dst->cached += src->cached;
- dst->reserved += src->reserved;
- dst->nr_inodes += src->nr_inodes;
-
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -747,150 +535,104 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
should_not_have_added, disk_res_sectors);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
-{
- struct bch_fs *c = trans->c;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- unsigned i;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- for (d = deltas->d; d != top; d = replicas_delta_next(d))
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
-
- dst->b.nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- dst->b.reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return 0;
-need_mark:
- /* revert changes: */
- for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
- BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return -1;
-}
-
/* KEY_TYPE_extent: */
-static int __mark_pointer(struct btree_trans *trans,
+static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
+ const struct extent_ptr_decoded *p,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
+ struct bch_alloc_v4 *a,
+ bool insert)
{
- u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type, *dst_sectors);
+ u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
+ !p->ptr.cached ? &a->dirty_sectors :
+ &a->cached_sectors;
+ int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
+ a->gen, a->data_type, dst_sectors);
if (ret)
return ret;
-
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
+ if (insert)
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
s64 *sectors,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
- struct bpos bucket;
- struct bch_backpointer bp;
+ struct bch_fs *c = trans->c;
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
- *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
+ *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
- ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (unlikely(!ca)) {
+ if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
+ ret = -BCH_ERR_trigger_pointer;
+ goto err;
+ }
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
if (ret)
- return ret;
+ goto err;
if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
if (ret)
- return ret;
+ goto err;
}
}
- if (flags & BTREE_TRIGGER_GC) {
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
-
- percpu_down_read(&c->mark_lock);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- bucket_lock(g);
- struct bucket old = *g;
-
- u8 bucket_data_type = g->data_type;
- int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (ret) {
- bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
- return ret;
+ if (flags & BTREE_TRIGGER_gc) {
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ p.ptr.dev,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -BCH_ERR_trigger_pointer;
+ goto err;
}
- g->data_type = bucket_data_type;
- struct bucket new = *g;
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
+ alloc_to_bucket(g, new);
bucket_unlock(g);
- bch2_dev_usage_update_m(c, ca, &old, &new);
- percpu_up_read(&c->mark_lock);
- }
- return 0;
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ }
+err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
}
static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
struct bkey_s_c k,
struct extent_ptr_decoded p,
enum bch_data_type data_type,
- s64 sectors, unsigned flags)
+ s64 sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
+ BTREE_ITER_with_updates, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
@@ -903,7 +645,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -EIO;
+ ret = -BCH_ERR_trigger_stripe_pointer;
goto err;
}
@@ -911,20 +653,20 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
- r.e.data_type = data_type;
- ret = bch2_update_replicas_list(trans, &r.e, sectors);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = data_type;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
@@ -942,16 +684,21 @@ err:
(u64) p.ec.idx, buf.buf);
printbuf_exit(&buf);
bch2_inconsistent_error(c);
- return -EIO;
+ return -BCH_ERR_trigger_stripe_pointer;
}
m->block_sectors[p.ec.block] += sectors;
- struct bch_replicas_padded r = m->r;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
mutex_unlock(&c->ec_stripes_heap_lock);
- r.e.data_type = data_type;
- bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+ acc.replicas.data_type = data_type;
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
+ if (ret)
+ return ret;
}
return 0;
@@ -959,45 +706,49 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags,
+ s64 *replicas_sectors)
{
- bool gc = flags & BTREE_TRIGGER_GC;
- struct bch_fs *c = trans->c;
+ bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bch_replicas_padded r;
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 dirty_sectors = 0;
int ret = 0;
- r.e.data_type = data_type;
- r.e.nr_devs = 0;
- r.e.nr_required = 1;
+ struct disk_accounting_pos acc_replicas_key = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ .replicas.data_type = data_type,
+ .replicas.nr_devs = 0,
+ .replicas.nr_required = 1,
+ };
+
+ struct disk_accounting_pos acct_compression_key = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ };
+ u64 compression_acct[3] = { 1, 0, 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+ s64 disk_sectors = 0;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
bool stale = ret > 0;
+ if (p.ptr.cached && stale)
+ continue;
+
if (p.ptr.cached) {
- if (!stale) {
- ret = !gc
- ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
- : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
- bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- if (ret)
- return ret;
- }
+ ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
+ if (ret)
+ return ret;
} else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
- r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ *replicas_sectors += disk_sectors;
+ replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@@ -1008,21 +759,77 @@ static int __trigger_extent(struct btree_trans *trans,
* if so they're not required for mounting if we have an
* erasure coded pointer in this extent:
*/
- r.e.nr_required = 0;
+ acc_replicas_key.replicas.nr_required = 0;
}
- }
- if (r.e.nr_devs) {
- ret = !gc
- ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
- : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
- if (unlikely(ret && gc)) {
- struct printbuf buf = PRINTBUF;
+ if (acct_compression_key.compression.type &&
+ acct_compression_key.compression.type != p.crc.compression_type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
- printbuf_exit(&buf);
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+
+ compression_acct[0] = 1;
+ compression_acct[1] = 0;
+ compression_acct[2] = 0;
+ }
+
+ acct_compression_key.compression.type = p.crc.compression_type;
+ if (p.crc.compression_type) {
+ compression_acct[1] += p.crc.uncompressed_size;
+ compression_acct[2] += p.crc.compressed_size;
}
+ }
+
+ if (acc_replicas_key.replicas.nr_devs) {
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
+ struct disk_accounting_pos acc_snapshot_key = {
+ .type = BCH_DISK_ACCOUNTING_snapshot,
+ .snapshot.id = k.k->p.snapshot,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ if (acct_compression_key.compression.type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
+
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+ }
+
+ if (level) {
+ struct disk_accounting_pos acc_btree_key = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = btree_id,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ } else {
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct disk_accounting_pos acc_inum_key = {
+ .type = BCH_DISK_ACCOUNTING_inum,
+ .inum.inum = k.k->p.inode,
+ };
+ s64 v[3] = {
+ insert ? 1 : -1,
+ insert ? k.k->size : -((s64) k.k->size),
+ *replicas_sectors,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
if (ret)
return ret;
}
@@ -1031,15 +838,19 @@ static int __trigger_extent(struct btree_trans *trans,
}
int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
+
/* if pointers aren't changing - nothing to do: */
if (new_ptrs_bytes == old_ptrs_bytes &&
!memcmp(new_ptrs.start,
@@ -1047,60 +858,75 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
- if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
+ if (old.k->type) {
+ int ret = __trigger_extent(trans, btree, level, old,
+ flags & ~BTREE_TRIGGER_insert,
+ &old_replicas_sectors);
if (ret)
return ret;
}
- }
- if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
- return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+ if (new.k->type) {
+ int ret = __trigger_extent(trans, btree, level, new.s_c,
+ flags & ~BTREE_TRIGGER_overwrite,
+ &new_replicas_sectors);
+ if (ret)
+ return ret;
+ }
- return 0;
-}
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta = 0;
-/* KEY_TYPE_reservation */
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta -= s;
-static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size * replicas;
+ s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta += s;
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, need_rebalance_delta > 0);
+ if (ret)
+ return ret;
+ }
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
+ if (need_rebalance_sectors_delta) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
+ flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+ }
- struct replicas_delta_list *d = trans->fs_usage_deltas;
- replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
+ return 0;
+}
- d->persistent_reserved[replicas - 1] += sectors;
- }
+/* KEY_TYPE_reservation */
- if (flags & BTREE_TRIGGER_GC) {
- percpu_down_read(&c->mark_lock);
- preempt_disable();
+static int __trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 sectors = k.k->size;
- struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
+ if (flags & BTREE_TRIGGER_overwrite)
+ sectors = -sectors;
- replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->b.reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
+ };
- preempt_enable();
- percpu_up_read(&c->mark_lock);
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
}
return 0;
@@ -1109,7 +935,7 @@ static int __trigger_reservation(struct btree_trans *trans,
int bch2_trigger_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
@@ -1117,35 +943,29 @@ int bch2_trigger_reservation(struct btree_trans *trans,
/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
+ struct bch_dev *ca, u64 b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
int ret = 0;
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
if (IS_ERR(a))
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_bucket_metadata_type_mismatch,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
- ret = -EIO;
+ ret = -BCH_ERR_metadata_bucket_inconsistency;
goto err;
}
@@ -1156,24 +976,80 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
err:
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ u64 b, enum bch_data_type data_type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ struct bucket *g = gc_bucket(ca, b);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
+ ca->dev_idx, bch2_data_type_str(data_type)))
+ goto err;
+
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type)))
+ goto err_unlock;
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_type_str(g->data_type ?: data_type),
+ g->dirty_sectors, sectors))
+ goto err_unlock;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+ bucket_unlock(g);
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ return ret;
+err_unlock:
+ bucket_unlock(g);
+err:
+ return -BCH_ERR_metadata_bucket_inconsistency;
+}
+
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
- enum bch_data_type type,
- unsigned sectors)
+ struct bch_dev *ca, u64 b,
+ enum bch_data_type type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ BUG_ON(type != BCH_DATA_free &&
+ type != BCH_DATA_sb &&
+ type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ if (flags & BTREE_TRIGGER_gc)
+ return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
+ else if (flags & BTREE_TRIGGER_transactional)
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ else
+ BUG();
}
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- u64 *bucket, unsigned *bucket_sectors)
+ struct bch_dev *ca, u64 start, u64 end,
+ enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
+ enum btree_iter_update_trigger_flags flags)
{
do {
u64 b = sector_to_bucket(ca, start);
@@ -1182,7 +1058,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
if (b != *bucket && *bucket_sectors) {
int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors);
+ type, *bucket_sectors, flags);
if (ret)
return ret;
@@ -1197,35 +1073,40 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
return 0;
}
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
- struct bch_dev *ca)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ struct bch_fs *c = trans->c;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_layout layout = ca->disk_sb.sb->layout;
+ mutex_unlock(&c->sb_lock);
+
u64 bucket = 0;
unsigned i, bucket_sectors = 0;
int ret;
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ for (i = 0; i < layout.nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout.sb_offset[i]);
if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, ca,
0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ offset + (1 << layout.sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
if (bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors);
+ bucket, BCH_DATA_sb, bucket_sectors, flags);
if (ret)
return ret;
}
@@ -1233,7 +1114,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size);
+ BCH_DATA_journal, ca->mi.bucket_size, flags);
if (ret)
return ret;
}
@@ -1241,20 +1122,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
return 0;
}
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
-
+ int ret = bch2_trans_run(c,
+ __bch2_trans_mark_dev_sb(trans, ca, flags));
bch_err_fn(c, ret);
return ret;
}
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
+ enum btree_iter_update_trigger_flags flags)
{
for_each_online_member(c, ca) {
- int ret = bch2_trans_mark_dev_sb(c, ca);
+ int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return ret;
}
}
@@ -1262,16 +1145,46 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
return 0;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
+}
+
+bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++)
+ if (b == ca->journal.buckets[i])
+ return true;
+
+ return false;
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
struct bch_fs_pcpu *pcpu;
- u64 old, v, get;
- s64 sectors_available;
+ u64 old, get;
+ u64 sectors_available;
int ret;
percpu_down_read(&c->mark_lock);
@@ -1281,17 +1194,16 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
if (sectors <= pcpu->sectors_available)
goto out;
- v = atomic64_read(&c->sectors_available);
+ old = atomic64_read(&c->sectors_available);
do {
- old = v;
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
preempt_enable();
goto recalculate;
}
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, old - get)) != old);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, old - get));
pcpu->sectors_available += get;
@@ -1310,6 +1222,9 @@ recalculate:
percpu_u64_set(&c->pcpu->sectors_available, 0);
sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
+ sectors = min(sectors, sectors_available);
+
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
@@ -1330,75 +1245,82 @@ recalculate:
/* Startup/shutdown: */
+void bch2_buckets_nouse_free(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ kvfree_rcu_mightsleep(ca->buckets_nouse);
+ ca->buckets_nouse = NULL;
+ }
+}
+
+int bch2_buckets_nouse_alloc(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->buckets_nouse);
+
+ ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!ca->buckets_nouse) {
+ bch2_dev_put(ca);
+ return -BCH_ERR_ENOMEM_buckets_nouse;
+ }
+ }
+
+ return 0;
+}
+
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+ kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- unsigned long *buckets_nouse = NULL;
bool resize = ca->bucket_gens != NULL;
int ret;
- if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO))) {
- ret = -BCH_ERR_ENOMEM_bucket_gens;
- goto err;
- }
+ if (resize)
+ lockdep_assert_held(&c->state_lock);
- if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
- ret = -BCH_ERR_ENOMEM_buckets_nouse;
+ if (resize && ca->buckets_nouse)
+ return -BCH_ERR_no_resize_with_buckets_nouse;
+
+ bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!bucket_gens) {
+ ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
-
- if (resize) {
- down_write(&c->gc_lock);
- down_write(&ca->bucket_lock);
- percpu_down_write(&c->mark_lock);
- }
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
-
+ bucket_gens->nbuckets = min(bucket_gens->nbuckets,
+ old_bucket_gens->nbuckets);
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
memcpy(bucket_gens->b,
old_bucket_gens->b,
- n);
- if (buckets_nouse)
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
+ bucket_gens->nbuckets);
}
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
bucket_gens = old_bucket_gens;
- swap(ca->buckets_nouse, buckets_nouse);
-
nbuckets = ca->mi.nbuckets;
- if (resize) {
- percpu_up_write(&c->mark_lock);
- up_write(&ca->bucket_lock);
- up_write(&c->gc_lock);
- }
-
ret = 0;
err:
- kvpfree(buckets_nouse,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@@ -1407,31 +1329,16 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
- unsigned i;
-
- kvpfree(ca->buckets_nouse,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
- sizeof(struct bucket_gens) + ca->mi.nbuckets);
-
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
- free_percpu(ca->usage[i]);
- kfree(ca->usage_base);
+ kvfree(ca->buckets_nouse);
+ kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
+ free_percpu(ca->usage);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
-
- ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
- if (!ca->usage_base)
+ ca->usage = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage)
return -BCH_ERR_ENOMEM_usage_init;
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
- ca->usage[i] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[i])
- return -BCH_ERR_ENOMEM_usage_init;
- }
-
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f789..a9acdd6c0c86 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -12,7 +12,7 @@
#include "extents.h"
#include "sb-members.h"
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
{
return div_u64(s, ca->mi.bucket_size);
}
@@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return remainder;
}
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
- u32 *offset)
+static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
{
return div_u64_rem(s, ca->mi.bucket_size, offset);
}
@@ -81,60 +80,58 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
-static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->buckets_gc,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
-}
-
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- struct bucket_array *buckets = gc_bucket_array(ca);
-
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
- return buckets->b + b;
+ return bucket_valid(ca, b)
+ ? genradix_ptr(&ca->buckets_gc, b)
+ : NULL;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
return rcu_dereference_check(ca->bucket_gens,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ lockdep_is_held(&ca->fs->state_lock));
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ if (b - gens->first_bucket >= gens->nbuckets_minus_first)
+ return NULL;
return gens->b + b;
}
+static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
+{
+ u8 *gen = bucket_gen(ca, b);
+ return gen ? *gen : -1;
+}
+
+static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
+{
+ rcu_read_lock();
+ int ret = bucket_gen_get_rcu(ca, b);
+ rcu_read_unlock();
+ return ret;
+}
+
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr,
u32 *bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
}
@@ -175,19 +172,21 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
+static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+{
+ int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
+ return gen < 0 ? gen : gen_after(gen, ptr->gen);
+}
+
/**
- * ptr_stale() - check if a pointer points into a bucket that has been
+ * dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 ptr_stale(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- u8 ret;
-
rcu_read_lock();
- ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ int ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
-
return ret;
}
@@ -202,8 +201,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_init(struct bch_dev *);
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -226,6 +224,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
@@ -263,129 +262,52 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
-static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
-{
- return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_u64s(struct bch_fs *c)
-{
- return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
-}
-
-static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
-{
- return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
-{
- return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
-}
-
static inline unsigned dev_usage_u64s(void)
{
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
-u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
-
-void bch2_fs_usage_to_text(struct printbuf *,
- struct bch_fs *, struct bch_fs_usage_online *);
-
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
-
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
-void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
- const struct bch_alloc_v4 *,
- const struct bch_alloc_v4 *, u64, bool);
-void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
- struct bucket *, struct bucket *);
-
-/* key/bucket marking: */
-
-static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
- unsigned journal_seq,
- bool gc)
-{
- percpu_rwsem_assert_held(&c->mark_lock);
- BUG_ON(!gc && !journal_seq);
-
- return this_cpu_ptr(gc
- ? c->usage_gc
- : c->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
-int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
- struct bch_replicas_entry_v1 *, s64,
- unsigned, bool);
-int bch2_update_replicas_list(struct btree_trans *,
- struct bch_replicas_entry_v1 *, s64);
-int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
-int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
-
-void bch2_fs_usage_initialize(struct bch_fs *);
+int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
+ struct bkey_s_c, const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32 *);
-int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
- const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32);
-
-int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
+int bch2_check_fix_ptrs(struct btree_trans *,
+ enum btree_id, unsigned, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
ret; \
})
void bch2_trans_account_disk_usage_change(struct btree_trans *);
-void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
+ enum bch_data_type, unsigned,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
+ enum btree_iter_update_trigger_flags);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- return false;
-}
+bool bch2_is_superblock_bucket(struct bch_dev *, u64);
static inline const char *bch2_data_type_str(enum bch_data_type type)
{
@@ -394,14 +316,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
: "(invalid data type)";
}
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
- if (type < BCH_DATA_NR)
- prt_str(out, __bch2_data_types[type]);
- else
- prt_printf(out, "(invalid data type %u)", type);
-}
-
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
@@ -413,25 +327,27 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
}
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+enum bch_reservation_flags {
+ BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
+ BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
+};
-int __bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
+ u64, enum bch_reservation_flags);
static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
#ifdef __KERNEL__
u64 old, new;
+ old = this_cpu_read(c->pcpu->sectors_available);
do {
- old = this_cpu_read(c->pcpu->sectors_available);
if (sectors > old)
return __bch2_disk_reservation_add(c, res, sectors, flags);
new = old - sectors;
- } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+ } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
@@ -471,6 +387,9 @@ static inline u64 avail_factor(u64 r)
return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
+void bch2_buckets_nouse_free(struct bch_fs *);
+int bch2_buckets_nouse_alloc(struct bch_fs *);
+
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
void bch2_dev_buckets_free(struct bch_dev *);
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 6a31740222a7..7174047b8e92 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -16,24 +16,19 @@ struct bucket {
u32 stripe;
u32 dirty_sectors;
u32 cached_sectors;
-};
-
-struct bucket_array {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- struct bucket b[];
-};
+ u32 stripe_sectors;
+} __aligned(sizeof(long));
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
- u8 b[];
+ size_t nbuckets_minus_first;
+ u8 b[] __counted_by(nbuckets);
};
struct bch_dev_usage {
- struct {
+ struct bch_dev_usage_type {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
/*
@@ -54,18 +49,6 @@ struct bch_fs_usage_base {
u64 nr_inodes;
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
- struct bch_fs_usage_base b;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- u64 replicas[];
-};
-
-struct bch_fs_usage_online {
- u64 online_reserved;
- struct bch_fs_usage u;
-};
-
struct bch_fs_usage_short {
u64 capacity;
u64 used;
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index ec1b636ef78d..c8a488e6b7b8 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_
memset(t->d, 0, sizeof(t->d[0]) << t->bits);
}
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket)
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
+ unsigned dev, u64 bucket)
{
struct buckets_waiting_for_journal_table *t;
u64 dev_bucket = (u64) dev << 56 | bucket;
- bool ret = false;
- unsigned i;
+ u64 ret = 0;
mutex_lock(&b->lock);
t = b->t;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq > flushed_seq;
+ ret = h->journal_seq;
break;
}
}
@@ -93,7 +91,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
.dev_bucket = (u64) dev << 56 | bucket,
.journal_seq = journal_seq,
};
- size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
int ret = 0;
mutex_lock(&b->lock);
@@ -106,8 +104,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
for (i = 0; i < size; i++)
nr_elements += t->d[i].journal_seq > flushed_seq;
- new_bits = t->bits + (nr_elements * 3 > size);
-
+ new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
+realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) {
ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
@@ -115,7 +113,16 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
}
retry_rehash:
+ if (nr_rehashes_this_size == 3) {
+ new_bits++;
+ nr_rehashes_this_size = 0;
+ kvfree(n);
+ goto realloc;
+ }
+
nr_rehashes++;
+ nr_rehashes_this_size++;
+
bucket_table_init(n, new_bits);
tmp = new;
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
index d2ae19cbe18c..365619ca44c8 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -4,8 +4,8 @@
#include "buckets_waiting_for_journal_types.h"
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64);
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
+ unsigned, u64);
int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
u64, unsigned, u64, u64);
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 226b39c17667..46e9e32105a9 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -5,11 +5,12 @@
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
+#include "disk_accounting.h"
+#include "fsck.h"
#include "journal.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
-#include "super.h"
#include "super-io.h"
#include "thread_with_file.h"
@@ -22,12 +23,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -38,12 +33,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
if (dev >= c->sb.nr_devices)
return ERR_PTR(-EINVAL);
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
return ERR_PTR(-EINVAL);
} else {
@@ -137,102 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
-struct fsck_thread {
- struct thread_with_stdio thr;
- struct bch_fs *c;
- char **devs;
- size_t nr_devs;
- struct bch_opts opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
- kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(void *arg)
-{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
-
- thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
- if (!thr->thr.thr.ret)
- bch2_fs_stop(c);
-
- thread_with_stdio_done(&thr->thr);
- return 0;
-}
-
-static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
- struct bch_ioctl_fsck_offline arg;
- struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
- long ret = 0;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
-
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
- }
-
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
- }
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, optstr);
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
-
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_offline_thread_fn);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
- return ret;
-}
-
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
long ret;
@@ -304,7 +198,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
@@ -371,7 +266,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
return PTR_ERR(ca);
ret = bch2_dev_offline(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -400,7 +295,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
if (ret)
bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -453,7 +348,6 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
static const struct file_operations bcachefs_data_ops = {
.release = bch2_data_job_release,
.read = bch2_data_job_read,
- .llseek = no_llseek,
};
static long bch2_ioctl_data(struct bch_fs *c,
@@ -486,11 +380,9 @@ static long bch2_ioctl_data(struct bch_fs *c,
static long bch2_ioctl_fs_usage(struct bch_fs *c,
struct bch_ioctl_fs_usage __user *user_arg)
{
- struct bch_ioctl_fs_usage *arg = NULL;
- struct bch_replicas_usage *dst_e, *dst_end;
- struct bch_fs_usage_online *src;
+ struct bch_ioctl_fs_usage arg = {};
+ darray_char replicas = {};
u32 replica_entries_bytes;
- unsigned i;
int ret = 0;
if (!test_bit(BCH_FS_started, &c->flags))
@@ -499,62 +391,60 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
return -EFAULT;
- arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
- if (!arg)
- return -ENOMEM;
-
- src = bch2_fs_usage_read(c);
- if (!src) {
- ret = -ENOMEM;
+ ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
+ (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
+ if (ret)
goto err;
- }
-
- arg->capacity = c->capacity;
- arg->used = bch2_fs_sectors_used(c, src);
- arg->online_reserved = src->online_reserved;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- arg->persistent_reserved[i] = src->u.persistent_reserved[i];
-
- dst_e = arg->replicas;
- dst_end = (void *) arg->replicas + replica_entries_bytes;
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *src_e =
- cpu_replicas_entry(&c->replicas, i);
-
- /* check that we have enough space for one replicas entry */
- if (dst_e + 1 > dst_end) {
- ret = -ERANGE;
- break;
- }
- dst_e->sectors = src->u.replicas[i];
- dst_e->r = *src_e;
+ struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
+ arg.capacity = c->capacity;
+ arg.used = u.used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.replica_entries_bytes = replicas.nr;
- /* recheck after setting nr_devs: */
- if (replicas_usage_next(dst_e) > dst_end) {
- ret = -ERANGE;
- break;
- }
-
- memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = i,
+ };
- dst_e = replicas_usage_next(dst_e);
+ bch2_accounting_mem_read(c,
+ disk_accounting_pos_to_bpos(&k),
+ &arg.persistent_reserved[i], 1);
}
- arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+err:
+ darray_exit(&replicas);
+ return ret;
+}
+
+static long bch2_ioctl_query_accounting(struct bch_fs *c,
+ struct bch_ioctl_query_accounting __user *user_arg)
+{
+ struct bch_ioctl_query_accounting arg;
+ darray_char accounting = {};
+ int ret = 0;
- percpu_up_read(&c->mark_lock);
- kfree(src);
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+ ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
+ bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
+ (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
if (ret)
goto err;
- ret = copy_to_user_errcode(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes);
+ arg.capacity = c->capacity;
+ arg.used = bch2_fs_usage_read_short(c).used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.accounting_u64s = accounting.nr / sizeof(u64);
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
- kfree(arg);
+ darray_exit(&accounting);
return ret;
}
@@ -589,13 +479,13 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- for (i = 0; i < BCH_DATA_NR; i++) {
+ for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
arg.d[i].buckets = src.d[i].buckets;
arg.d[i].sectors = src.d[i].sectors;
arg.d[i].fragmented = src.d[i].fragmented;
}
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
}
@@ -647,7 +537,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
goto err;
}
err:
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -669,11 +559,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
if (arg.flags & BCH_READ_DEV) {
ca = bch2_device_lookup(c, arg.dev, arg.flags);
-
- if (IS_ERR(ca)) {
- ret = PTR_ERR(ca);
- goto err;
- }
+ ret = PTR_ERR_OR_ZERO(ca);
+ if (ret)
+ goto err_unlock;
sb = ca->disk_sb.sb;
} else {
@@ -688,8 +576,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
vstruct_bytes(sb));
err:
- if (!IS_ERR_OR_NULL(ca))
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
+err_unlock:
mutex_unlock(&c->sb_lock);
return ret;
}
@@ -733,7 +621,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
ret = bch2_dev_resize(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -759,98 +647,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
- return ret;
-}
-
-static int bch2_fsck_online_thread_fn(void *arg)
-{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- c->stdio_filter = current;
- c->stdio = &thr->thr.stdio;
-
- /*
- * XXX: can we figure out a way to do this without mucking with c->opts?
- */
- unsigned old_fix_errors = c->opts.fix_errors;
- if (opt_defined(thr->opts, fix_errors))
- c->opts.fix_errors = thr->opts.fix_errors;
- else
- c->opts.fix_errors = FSCK_FIX_ask;
-
- c->opts.fsck = true;
- set_bit(BCH_FS_fsck_running, &c->flags);
-
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
- int ret = bch2_run_online_recovery_passes(c);
-
- clear_bit(BCH_FS_fsck_running, &c->flags);
- bch_err_fn(c, ret);
-
- c->stdio = NULL;
- c->stdio_filter = NULL;
- c->opts.fix_errors = old_fix_errors;
-
- thread_with_stdio_done(&thr->thr);
-
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- return 0;
-}
-
-static long bch2_ioctl_fsck_online(struct bch_fs *c,
- struct bch_ioctl_fsck_online arg)
-{
- struct fsck_thread *thr = NULL;
- long ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!bch2_ro_ref_tryget(c))
- return -EROFS;
-
- if (down_trylock(&c->online_fsck_mutex)) {
- bch2_ro_ref_put(c);
- return -EAGAIN;
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->c = c;
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, optstr);
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_online_thread_fn);
-err:
- if (ret < 0) {
- bch_err_fn(c, ret);
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- }
+ bch2_dev_put(ca);
return ret;
}
@@ -911,6 +708,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
case BCH_IOCTL_FSCK_ONLINE:
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
+ case BCH_IOCTL_QUERY_ACCOUNTING:
+ return bch2_ioctl_query_accounting(c, arg);
default:
return -ENOTTY;
}
@@ -940,7 +739,9 @@ static const struct file_operations bch_chardev_fops = {
};
static int bch_chardev_major;
-static struct class *bch_chardev_class;
+static const struct class bch_chardev_class = {
+ .name = "bcachefs",
+};
static struct device *bch_chardev;
void bch2_fs_chardev_exit(struct bch_fs *c)
@@ -957,7 +758,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
if (c->minor < 0)
return c->minor;
- c->chardev = device_create(bch_chardev_class, NULL,
+ c->chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, c->minor), c,
"bcachefs%u-ctl", c->minor);
if (IS_ERR(c->chardev))
@@ -968,32 +769,39 @@ int bch2_fs_chardev_init(struct bch_fs *c)
void bch2_chardev_exit(void)
{
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- device_destroy(bch_chardev_class,
- MKDEV(bch_chardev_major, U8_MAX));
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- class_destroy(bch_chardev_class);
+ device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
+ class_unregister(&bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcachefs");
}
int __init bch2_chardev_init(void)
{
+ int ret;
+
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
if (bch_chardev_major < 0)
return bch_chardev_major;
- bch_chardev_class = class_create("bcachefs");
- if (IS_ERR(bch_chardev_class))
- return PTR_ERR(bch_chardev_class);
+ ret = class_register(&bch_chardev_class);
+ if (ret)
+ goto major_out;
- bch_chardev = device_create(bch_chardev_class, NULL,
+ bch_chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, U8_MAX),
NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev))
- return PTR_ERR(bch_chardev);
+ if (IS_ERR(bch_chardev)) {
+ ret = PTR_ERR(bch_chardev);
+ goto class_out;
+ }
return 0;
+
+class_out:
+ class_unregister(&bch_chardev_class);
+major_out:
+ unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+ return ret;
}
#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3c761ad6b1c8..23a383577d4c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "checksum.h"
#include "errcode.h"
+#include "error.h"
#include "super.h"
#include "super-io.h"
@@ -10,6 +11,7 @@
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
+#include <linux/ratelimit.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
#include <crypto/chacha.h>
@@ -99,12 +101,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
- ret = crypto_skcipher_encrypt(req);
+ int ret = crypto_skcipher_encrypt(req);
if (ret)
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
@@ -116,38 +118,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
void *buf, size_t len)
{
if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg,
- is_vmalloc_addr(buf)
- ? vmalloc_to_page(buf)
- : virt_to_page(buf),
- len, offset_in_page(buf));
+ struct scatterlist sg = {};
+
+ sg_mark_end(&sg);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
return do_encrypt_sg(tfm, nonce, &sg, len);
} else {
- unsigned pages = buf_pages(buf, len);
- struct scatterlist *sg;
- size_t orig_len = len;
- int ret, i;
-
- sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return -BCH_ERR_ENOMEM_do_encrypt;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret;
- sg_init_table(sg, pages);
+ darray_init(&sgl);
- for (i = 0; i < pages; i++) {
+ while (len) {
unsigned offset = offset_in_page(buf);
- unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) vmalloc_to_page(buf),
+ .offset = offset,
+ .length = min(len, PAGE_SIZE - offset),
+ };
- sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
- buf += pg_len;
- len -= pg_len;
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+ BUG_ON(darray_push(&sgl, sg));
+ }
+
+ buf += sg.length;
+ len -= sg.length;
+ sgl_len += sg.length;
}
- ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
- kfree(sg);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
}
@@ -232,7 +243,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
@@ -242,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (!bch2_csum_type_is_encryption(type))
return 0;
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
+
return do_encrypt(c->chacha20, nonce, data, len);
}
@@ -306,7 +321,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
@@ -323,36 +338,44 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
int ret = 0;
- if (!bch2_csum_type_is_encryption(type))
- return 0;
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
+ darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
-
- ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) bv.bv_page,
+ .offset = bv.bv_offset,
+ .length = bv.bv_len,
+ };
+
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
if (ret)
- return ret;
+ goto err;
- nonce = nonce_add(nonce, bytes);
- bytes = 0;
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
+ BUG_ON(darray_push(&sgl, sg));
}
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
+ sgl_len += sg.length;
}
- sg_mark_end(sg - 1);
- return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
+ return ret;
}
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
@@ -429,15 +452,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo,
- bch2_csum_types[crc_old.csum_type],
- bch2_csum_types[new_csum_type]);
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ WARN_RATELIMIT(1, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
@@ -463,9 +491,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
/* BCH_SB_FIELD_crypt: */
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
@@ -488,14 +515,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
- prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
- prt_newline(out);
+ prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
+ prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
+ prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
}
const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
@@ -558,7 +581,7 @@ got_key:
return 0;
}
-#include "../crypto.h"
+#include "crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
@@ -647,26 +670,26 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
- int ret;
-
- if (!c->chacha20)
- c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->chacha20);
+ if (c->chacha20)
+ return 0;
+ struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ int ret = PTR_ERR_OR_ZERO(chacha20);
if (ret) {
bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
return ret;
}
- if (!c->poly1305)
- c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->poly1305);
-
+ struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ ret = PTR_ERR_OR_ZERO(poly1305);
if (ret) {
bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
+ c->chacha20 = chacha20;
+ c->poly1305 = poly1305;
return 0;
}
@@ -761,11 +784,11 @@ err:
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->poly1305))
+ if (c->poly1305)
crypto_free_shash(c->poly1305);
- if (!IS_ERR_OR_NULL(c->chacha20))
+ if (c->chacha20)
crypto_free_sync_skcipher(c->chacha20);
- if (!IS_ERR_OR_NULL(c->sha256))
+ if (c->sha256)
crypto_free_shash(c->sha256);
}
@@ -778,6 +801,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
ret = PTR_ERR_OR_ZERO(c->sha256);
if (ret) {
+ c->sha256 = NULL;
bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016dc..43b9d71f2f2b 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
struct bch_csum expected,
struct bch_csum got)
{
- prt_printf(out, "checksum error: got ");
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
- prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
@@ -108,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
bool data)
{
switch (type) {
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 363644451106..1f8e035d7119 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -6,44 +6,45 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
-static inline long io_timer_cmp(io_timer_heap *h,
- struct io_timer *l,
- struct io_timer *r)
+static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
{
- return l->expire - r->expire;
+ struct io_timer **_l = (struct io_timer **)l;
+ struct io_timer **_r = (struct io_timer **)r;
+
+ return (*_l)->expire < (*_r)->expire;
}
+static const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = NULL,
+};
+
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
-
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic64_read(&clock->now),
- timer->expire)) {
+ if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
return;
}
- for (i = 0; i < clock->timers.used; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer)
goto out;
- BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+ BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
out:
spin_unlock(&clock->timer_lock);
}
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
-
spin_lock(&clock->timer_lock);
- for (i = 0; i < clock->timers.used; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer) {
- heap_del(&clock->timers, i, io_timer_cmp, NULL);
+ min_heap_del(&clock->timers, i, &callbacks, NULL);
break;
}
@@ -75,33 +76,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer)
wake_up_process(wait->task);
}
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- /* XXX: calculate sleep time rigorously */
- wait.io_timer.expire = until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
-
schedule();
-
bch2_io_timer_del(clock, &wait.io_timer);
}
void bch2_kthread_io_clock_wait(struct io_clock *clock,
- unsigned long io_until,
- unsigned long cpu_timeout)
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = io_until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- wait.io_timer.expire = io_until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
@@ -127,44 +126,44 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
bch2_io_timer_del(clock, &wait.io_timer);
}
-static struct io_timer *get_expired_timer(struct io_clock *clock,
- unsigned long now)
+static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
{
struct io_timer *ret = NULL;
- spin_lock(&clock->timer_lock);
-
- if (clock->timers.used &&
- time_after_eq(now, clock->timers.data[0]->expire))
- heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
-
- spin_unlock(&clock->timer_lock);
+ if (clock->timers.nr &&
+ time_after_eq64(now, clock->timers.data[0]->expire)) {
+ ret = *min_heap_peek(&clock->timers);
+ min_heap_pop(&clock->timers, &callbacks, NULL);
+ }
return ret;
}
-void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
{
struct io_timer *timer;
- unsigned long now = atomic64_add_return(sectors, &clock->now);
+ u64 now = atomic64_add_return(sectors, &clock->now);
+ spin_lock(&clock->timer_lock);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
+ spin_unlock(&clock->timer_lock);
}
void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
{
- unsigned long now;
- unsigned i;
-
out->atomic++;
spin_lock(&clock->timer_lock);
- now = atomic64_read(&clock->now);
+ u64 now = atomic64_read(&clock->now);
+
+ printbuf_tabstop_push(out, 40);
+ prt_printf(out, "current time:\t%llu\n", now);
- for (i = 0; i < clock->timers.used; i++)
- prt_printf(out, "%ps:\t%li\n",
+ for (unsigned i = 0; i < clock->timers.nr; i++)
+ prt_printf(out, "%ps %ps:\t%llu\n",
clock->timers.data[i]->fn,
- clock->timers.data[i]->expire - now);
+ clock->timers.data[i]->fn2,
+ clock->timers.data[i]->expire);
spin_unlock(&clock->timer_lock);
--out->atomic;
}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 70a0f7436c84..82c79c8baf92 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,12 +4,11 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
- unsigned long);
+void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-void __bch2_increment_clock(struct io_clock *, unsigned);
+void __bch2_increment_clock(struct io_clock *, u64);
-static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
int rw)
{
struct io_clock *clock = &c->io_clock[rw];
@@ -19,16 +18,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
__bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
}
-void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
-
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
+void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 5fae0012d808..37554e4514fe 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -17,13 +17,14 @@ typedef void (*io_timer_fn)(struct io_timer *);
struct io_timer {
io_timer_fn fn;
- unsigned long expire;
+ void *fn2;
+ u64 expire;
};
/* Amount to buffer up on a percpu counter */
#define IO_CLOCK_PCPU_SECTORS 128
-typedef HEAP(struct io_timer *) io_timer_heap;
+typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
struct io_clock {
atomic64_t now;
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 33df8cf86bd8..114bf2f3879f 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -2,13 +2,34 @@
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
+#include "error.h"
#include "extents.h"
+#include "io_write.h"
+#include "opts.h"
#include "super-io.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
#include <linux/zstd.h>
+static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
+{
+ switch (type) {
+ case BCH_COMPRESSION_TYPE_none:
+ case BCH_COMPRESSION_TYPE_incompressible:
+ return BCH_COMPRESSION_OPT_none;
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
+ return BCH_COMPRESSION_OPT_lz4;
+ case BCH_COMPRESSION_TYPE_gzip:
+ return BCH_COMPRESSION_OPT_gzip;
+ case BCH_COMPRESSION_TYPE_zstd:
+ return BCH_COMPRESSION_OPT_zstd;
+ default:
+ BUG();
+ }
+}
+
/* Bounce buffer: */
struct bbuf {
void *b;
@@ -158,6 +179,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *workspace;
int ret;
+ enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
+ mempool_t *workspace_pool = &c->compress_workspace[opt];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_type_not_marked_in_sb,
+ "compression type %s set but not marked in superblock",
+ __bch2_compression_types[crc.compression_type]))
+ ret = bch2_check_set_has_compressed_data(c, opt);
+ else
+ ret = -BCH_ERR_compression_workspace_not_initialized;
+ if (ret)
+ goto out;
+ }
+
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
@@ -176,13 +210,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
.avail_out = dst_len,
};
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
ret = zlib_inflate(&strm, Z_FINISH);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
if (ret != Z_STREAM_END)
goto err;
@@ -195,14 +229,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
if (real_src_len > src_len - 4)
goto err;
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
if (ret != dst_len)
goto err;
@@ -212,6 +246,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
BUG();
}
ret = 0;
+fsck_err:
out:
bio_unmap_or_unbounce(c, src_data);
return ret;
@@ -220,11 +255,14 @@ err:
goto out;
}
-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
- struct bch_extent_crc_unpacked *crc)
+int bch2_bio_uncompress_inplace(struct bch_write_op *op,
+ struct bio *bio)
{
+ struct bch_fs *c = op->c;
+ struct bch_extent_crc_unpacked *crc = &op->crc;
struct bbuf data = { NULL };
size_t dst_len = crc->uncompressed_size << 9;
+ int ret = 0;
/* bio must own its pages: */
BUG_ON(!bio->bi_vcnt);
@@ -232,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc->compressed_size << 9 > c->opts.encoded_extent_max) {
- bch_err(c, "error rewriting existing data: extent too big");
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error rewriting existing data: extent too big");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
data = __bounce_alloc(c, dst_len, WRITE);
if (__bio_uncompress(c, bio, data.b, *crc)) {
- if (!c->opts.no_data_io)
- bch_err(c, "error rewriting existing data: decompression error");
- bio_unmap_or_unbounce(c, data);
- return -EIO;
+ if (!c->opts.no_data_io) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error rewriting existing data: decompression error");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = -EIO;
+ goto err;
}
/*
@@ -259,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
crc->uncompressed_size = crc->live_size;
crc->offset = 0;
crc->csum = (struct bch_csum) { 0, 0 };
-
+err:
bio_unmap_or_unbounce(c, data);
- return 0;
+ return ret;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
@@ -394,8 +441,21 @@ static unsigned __bio_compress(struct bch_fs *c,
unsigned pad;
int ret = 0;
- BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
- BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+ /* bch2_compression_decode catches unknown compression types: */
+ BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
+
+ mempool_t *workspace_pool = &c->compress_workspace[compression.type];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_opt_not_marked_in_sb,
+ "compression opt %s set but not marked in superblock",
+ bch2_compression_opts[compression.type])) {
+ ret = bch2_check_set_has_compressed_data(c, compression.type);
+ if (ret) /* memory allocation failure, don't compress */
+ return 0;
+ } else {
+ return 0;
+ }
+ }
/* If it's only one block, don't bother trying to compress: */
if (src->bi_iter.bi_size <= c->opts.block_size)
@@ -404,7 +464,7 @@ static unsigned __bio_compress(struct bch_fs *c,
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
*src_len = src->bi_iter.bi_size;
*dst_len = dst->bi_iter.bi_size;
@@ -447,7 +507,7 @@ static unsigned __bio_compress(struct bch_fs *c,
*src_len = round_down(*src_len, block_bytes(c));
}
- mempool_free(workspace, &c->compress_workspace[compression_type]);
+ mempool_free(workspace, workspace_pool);
if (ret)
goto err;
@@ -477,6 +537,9 @@ out:
err:
ret = BCH_COMPRESSION_TYPE_incompressible;
goto out;
+fsck_err:
+ ret = 0;
+ goto out;
}
unsigned bch2_bio_compress(struct bch_fs *c,
@@ -559,7 +622,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
{
unsigned i;
- mempool_exit(&c->decompress_workspace);
for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
@@ -568,7 +630,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
- size_t decompress_workspace_size = 0;
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
@@ -576,19 +637,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
struct {
unsigned feature;
- enum bch_compression_type type;
+ enum bch_compression_opts type;
size_t compress_workspace;
- size_t decompress_workspace;
} compression_types[] = {
- { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
- 0 },
- { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize(), },
- { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
- c->zstd_workspace_size,
- zstd_dctx_workspace_bound() },
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
+ max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize()) },
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
+ max(c->zstd_workspace_size,
+ zstd_dctx_workspace_bound()) },
}, *i;
bool have_compressed = false;
@@ -601,38 +660,30 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++) {
- decompress_workspace_size =
- max(decompress_workspace_size, i->decompress_workspace);
-
if (!(features & (1 << i->feature)))
continue;
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- if (mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
- if (!mempool_initialized(&c->decompress_workspace) &&
- mempool_init_kvpmalloc_pool(&c->decompress_workspace,
- 1, decompress_workspace_size))
- return -BCH_ERR_ENOMEM_decompression_workspace_init;
-
return 0;
}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb45570f..bec2f05bfd52 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,16 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
- if (type < BCH_COMPRESSION_TYPE_NR)
- prt_str(out, __bch2_compression_types[type]);
- else
- prt_printf(out, "(invalid compression type %u)", type);
-}
-
-int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
- struct bch_extent_crc_unpacked *);
+struct bch_write_op;
+int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index ac35b8b705ae..e86d36d23e9e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -2,18 +2,32 @@
#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "darray.h"
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array(new_size, element_size, gfp);
+ /*
+ * This is a workaround: kvmalloc() doesn't support > INT_MAX
+ * allocations, but vmalloc() does.
+ * The limit needs to be lifted from kvmalloc, and when it does
+ * we'll go back to just using that.
+ */
+ size_t bytes;
+ if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
+ return -ENOMEM;
+
+ void *data = likely(bytes < INT_MAX)
+ ? kvmalloc_noprof(bytes, gfp)
+ : vmalloc_noprof(bytes);
if (!data)
return -ENOMEM;
- memcpy(data, d->data, d->size * element_size);
+ if (d->size)
+ memcpy(data, d->data, d->size * element_size);
if (d->data != d->preallocated)
kvfree(d->data);
d->data = data;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4b340d13caac..c6151495985f 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
-
-static inline int __darray_resize(darray_char *d, size_t element_size,
- size_t new_size, gfp_t gfp)
-{
- return unlikely(new_size > d->size)
- ? __bch2_darray_resize(d, element_size, new_size, gfp)
- : 0;
-}
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+
+#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
+
+#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+ (unlikely((_new_size) > (_d)->size) \
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ : 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
-{
- return __darray_resize(d, t_size, d->nr + more, gfp);
-}
-
#define darray_make_room_gfp(_d, _more, _gfp) \
- __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+ darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
@@ -89,7 +83,7 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each_reverse(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
#define darray_init(_d) \
do { \
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42a2..642fbc60ecab 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -5,7 +5,9 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "data_update.h"
+#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -14,20 +16,105 @@
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_move_extent_finish_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+}
+
+static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!bch2_dev_tryget(c, ptr->dev)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+ }
+ return false;
+ }
+ }
+ return true;
+}
- bch2_bkey_val_to_text(&buf, c, k);
- trace_move_extent_finish(c, buf.buf);
- printbuf_exit(&buf);
+static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
}
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (ctxt) {
+ bool locked;
+
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+
+ ca = bch2_dev_have_ref(c, ptr2->dev);
+ bucket = PTR_BUCKET_POS(ca, ptr2);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static noinline void trace_move_extent_finish2(struct data_update *u,
+ struct bkey_i *new,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = u->op.c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_newline(&buf);
+
+ bch2_data_update_to_text(&buf, u);
+ prt_newline(&buf);
+
+ prt_str_indented(&buf, "new replicas:\t");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ prt_newline(&buf);
+
+ prt_str_indented(&buf, "insert:\t");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_newline(&buf);
+
+ trace_move_extent_finish(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
static void trace_move_extent_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
@@ -36,11 +123,8 @@ static void trace_move_extent_fail2(struct data_update *m,
{
struct bch_fs *c = m->op.c;
struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- const union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct extent_ptr_decoded p;
struct printbuf buf = PRINTBUF;
- unsigned i, rewrites_found = 0;
+ unsigned rewrites_found = 0;
if (!trace_move_extent_fail_enabled())
return;
@@ -48,27 +132,25 @@ static void trace_move_extent_fail2(struct data_update *m,
prt_str(&buf, msg);
if (insert) {
- i = 0;
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+
+ unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached)
- rewrites_found |= 1U << i;
- i++;
+ rewrites_found |= ptr_bit;
+ ptr_bit <<= 1;
}
}
- prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
- (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+ prt_str(&buf, "rewrites found:\t");
+ bch2_prt_u64_base2(&buf, rewrites_found);
+ prt_newline(&buf);
- prt_printf(&buf, "\nrewrites found: %u%u%u%u",
- (rewrites_found & (1 << 0)) != 0,
- (rewrites_found & (1 << 1)) != 0,
- (rewrites_found & (1 << 2)) != 0,
- (rewrites_found & (1 << 3)) != 0);
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
@@ -105,7 +187,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
while (1) {
struct bkey_s_c k;
@@ -120,7 +202,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
struct bpos next_pos;
bool should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned rewrites_found = 0, durability, i;
+ unsigned rewrites_found = 0, durability, ptr_bit;
bch2_trans_begin(trans);
@@ -157,15 +239,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
*
* Fist, drop rewrite_ptrs from @new:
*/
- i = 0;
+ ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
- if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
- rewrites_found |= 1U << i;
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), ptr);
+ rewrites_found |= ptr_bit;
}
- i++;
+ ptr_bit <<= 1;
}
if (m->data_opts.rewrite_ptrs &&
@@ -201,6 +284,7 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
+ rcu_read_lock();
restart_drop_extra_replicas:
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
@@ -209,17 +293,19 @@ restart_drop_extra_replicas:
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
goto restart_drop_extra_replicas;
}
}
+ rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, bkey_i_to_s(insert));
+ bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
ret = bch2_sum_sector_overwrites(trans, &iter, insert,
&should_check_enospc,
@@ -245,14 +331,16 @@ restart_drop_extra_replicas:
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
- struct printbuf err = PRINTBUF;
- int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
- printbuf_exit(&err);
-
+ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
+ (struct bkey_validate_context) {
+ .btree = m->btree_id,
+ .flags = BCH_VALIDATE_commit,
+ });
if (invalid) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "about to insert invalid key in data update path");
+ prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
@@ -264,6 +352,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
+ ret = -EIO;
goto out;
}
@@ -285,9 +374,9 @@ restart_drop_extra_replicas:
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
@@ -297,7 +386,8 @@ restart_drop_extra_replicas:
bch2_btree_iter_set_pos(&iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
- trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+ if (trace_move_extent_finish_enabled())
+ trace_move_extent_finish2(m, &new->k_i, insert);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -352,16 +442,11 @@ void bch2_data_update_read_done(struct data_update *m,
void bch2_data_update_exit(struct data_update *update)
{
struct bch_fs *c = update->op.c;
- struct bkey_ptrs_c ptrs =
- bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (c->opts.nocow_enabled)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
- }
+ struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
+ if (c->opts.nocow_enabled)
+ bkey_nocow_unlock(c, k);
+ bkey_put_dev_refs(c, k);
bch2_bkey_buf_exit(&update->k, c);
bch2_disk_reservation_put(c, &update->op.res);
bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
@@ -385,8 +470,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
while (bio_sectors(bio)) {
unsigned sectors = bio_sectors(bio);
+ bch2_trans_begin(trans);
+
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
k = bch2_btree_iter_peek_slot(&iter);
bkey_err(k);
@@ -448,10 +535,50 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
+void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+
+ prt_str_indented(out, "rewrite ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str_indented(out, "kill ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str_indented(out, "target:\t");
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str_indented(out, "compression:\t");
+ bch2_compression_opt_to_text(out, io_opts->background_compression);
+ prt_newline(out);
+
+ prt_str_indented(out, "opts.replicas:\t");
+ prt_u64(out, io_opts->data_replicas);
+ prt_newline(out);
+
+ prt_str_indented(out, "extra replicas:\t");
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+ prt_newline(out);
+
+ prt_str_indented(out, "old key:\t");
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+}
+
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct data_update_opts data_opts)
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@@ -462,12 +589,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
if (ret)
return ret;
- while (data_opts.kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- struct bch_extent_ptr *ptr;
+ while (data_opts->kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts.kill_ptrs ^= 1U << drop;
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts->kill_ptrs ^= 1U << drop;
}
/*
@@ -475,19 +601,19 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
- bch2_extent_normalize(c, bkey_i_to_s(n));
+ bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
@@ -505,10 +631,26 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
- unsigned ptrs_locked = 0;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret = 0;
+ /*
+ * fs is corrupt we have a key for a snapshot node that doesn't exist,
+ * and we have to check for this because we go rw before repairing the
+ * snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
+ if (!bkey_get_dev_refs(c, k))
+ return -BCH_ERR_data_update_done;
+
+ if (c->opts.nocow_enabled &&
+ !bkey_nocow_lock(c, ctxt, k)) {
+ bkey_put_dev_refs(c, k);
+ return -BCH_ERR_nocow_lock_blocked;
+ }
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -518,7 +660,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->version;
+ m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
@@ -527,30 +669,26 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = background_compression(io_opts);
+ m->op.compression_opt = io_opts.background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
- bkey_for_each_ptr(ptrs, ptr)
- percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
-
unsigned durability_have = 0, durability_removing = 0;
- i = 0;
+ unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- bool locked;
-
- if (((1U << i) & m->data_opts.rewrite_ptrs)) {
- BUG_ON(p.ptr.cached);
-
- if (crc_is_compressed(p.crc))
- reserve_sectors += k.k->size;
-
- m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!p.ptr.cached &&
- !((1U << i) & m->data_opts.kill_ptrs)) {
- bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
- durability_have += bch2_extent_ptr_durability(c, &p);
+ if (!p.ptr.cached) {
+ rcu_read_lock();
+ if (ptr_bit & m->data_opts.rewrite_ptrs) {
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
+ }
+ rcu_read_unlock();
}
/*
@@ -566,30 +704,11 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
- if (c->opts.nocow_enabled) {
- if (ctxt) {
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
-
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) {
- ret = -BCH_ERR_nocow_lock_blocked;
- goto err;
- }
- }
- ptrs_locked |= (1U << i);
- }
-
- i++;
+ ptr_bit <<= 1;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -598,51 +717,49 @@ int bch2_data_update_init(struct btree_trans *trans,
* Increasing replication is an explicit operation triggered by
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
- if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ m->op.nr_replicas = min(durability_removing, durability_required) +
+ m->data_opts.extra_replicas;
+
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a duribilty=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
+ /*
+ * It might turn out that we don't need any new replicas, if the
+ * replicas or durability settings have been changed since the extent
+ * was written:
+ */
+ if (!m->op.nr_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
- goto done;
+ ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
+ goto out;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
- m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
-
- BUG_ON(!m->op.nr_replicas);
-
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL);
if (ret)
- goto err;
+ goto out;
}
if (bkey_extent_is_unwritten(k)) {
bch2_update_unwritten_extent(trans, m);
- goto done;
+ goto out;
}
return 0;
-err:
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if ((1U << i) & ptrs_locked)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
- i++;
- }
-
- bch2_bkey_buf_exit(&m->k, c);
- bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
- return ret;
-done:
+out:
bch2_data_update_exit(m);
return ret ?: -BCH_ERR_data_update_done;
}
@@ -650,14 +767,14 @@ done:
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
+ unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
- if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
- opts->kill_ptrs |= 1U << i;
- opts->rewrite_ptrs ^= 1U << i;
+ if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
+ opts->kill_ptrs |= ptr_bit;
+ opts->rewrite_ptrs ^= ptr_bit;
}
- i++;
+ ptr_bit <<= 1;
}
}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 991095bbd469..e4b50723428e 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -17,6 +17,9 @@ struct data_update_opts {
unsigned write_flags;
};
+void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_opts *, struct data_update_opts *);
+
struct data_update {
/* extent being updated: */
enum btree_id btree_id;
@@ -27,6 +30,8 @@ struct data_update {
struct bch_write_op op;
};
+void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+
int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *,
@@ -35,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *,
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
- struct data_update_opts);
+ struct bch_io_opts *,
+ struct data_update_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bdba8507fc9..55333e82d1fe 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,12 +13,14 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "fsck.h"
#include "inode.h"
+#include "journal_reclaim.h"
#include "super.h"
#include <linux/console.h>
@@ -36,11 +38,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_ondisk = c->verify_ondisk;
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
struct bio *bio;
bool failed = false, saw_error = false;
- if (!bch2_dev_get_ioref(ca, READ))
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca)
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
@@ -137,7 +139,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -170,7 +172,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
printbuf_exit(&buf);
}
out:
@@ -193,13 +195,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- if (!bch2_dev_get_ioref(ca, READ)) {
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
prt_printf(out, "error getting device to read from: not online\n");
return;
}
- n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@@ -293,7 +295,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_buf_bytes(b));
+ kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}
@@ -374,8 +376,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
bch2_bkey_val_to_text(&i->buf, i->c, k);
prt_newline(&i->buf);
bch2_trans_unlock(trans);
@@ -396,47 +398,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct btree *b;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
+ ssize_t ret = flush_buf(i);
if (ret)
return ret;
if (bpos_eq(SPOS_MAX, i->from))
return i->ret;
- trans = bch2_trans_get(i->c);
-retry:
- bch2_trans_begin(trans);
-
- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
- bch2_btree_node_to_text(&i->buf, i->c, b);
- i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
- ? bpos_successor(b->key.k.p)
- : b->key.k.p;
-
- ret = drop_locks_do(trans, flush_buf(i));
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ return bch2_trans_run(i->c,
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
+ : b->key.k.p;
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ drop_locks_do(trans, flush_buf(i));
+ }))) ?: i->ret;
}
static const struct file_operations btree_format_debug_ops = {
@@ -458,8 +440,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
struct btree_path_level *l =
&btree_iter_path(trans, &iter)->l[0];
struct bkey_packed *_k =
@@ -491,51 +473,28 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_printf(out, "%px btree=%s l=%u ",
- b,
- bch2_btree_id_str(b->c.btree_id),
- b->c.level);
- prt_newline(out);
+ prt_printf(out, "%px ", b);
+ bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
+ prt_printf(out, "\n");
printbuf_indent_add(out, 2);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
prt_newline(out);
- prt_printf(out, "flags: ");
- prt_tab(out);
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_btree_node_flags, b->flags);
prt_newline(out);
- prt_printf(out, "pcpu read locks: ");
- prt_tab(out);
- prt_printf(out, "%u", b->c.lock.readers != NULL);
- prt_newline(out);
-
- prt_printf(out, "written:");
- prt_tab(out);
- prt_printf(out, "%u", b->written);
- prt_newline(out);
-
- prt_printf(out, "writes blocked:");
- prt_tab(out);
- prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
- prt_newline(out);
-
- prt_printf(out, "will make reachable:");
- prt_tab(out);
- prt_printf(out, "%lx", b->will_make_reachable);
- prt_newline(out);
-
- prt_printf(out, "journal pin %px:", &b->writes[0].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[0].journal.seq);
- prt_newline(out);
+ prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
+ prt_printf(out, "written:\t%u\n", b->written);
+ prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
+ prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
- prt_printf(out, "journal pin %px:", &b->writes[1].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[1].journal.seq);
- prt_newline(out);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[0].journal, b->writes[0].journal.seq);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[1].journal, b->writes[1].journal.seq);
printbuf_indent_sub(out, 2);
}
@@ -592,6 +551,32 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -599,42 +584,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
struct bch_fs *c = i->c;
struct btree_trans *trans;
ssize_t ret = 0;
- u32 seq;
i->ubuf = buf;
i->size = size;
i->ret = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= i->iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ i->iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto unlocked;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
- prt_printf(&i->buf, "backtrace:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = task->pid;
-
closure_put(&trans->ref);
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
@@ -668,7 +650,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +658,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +678,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -746,25 +763,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
!bch2_btree_transaction_fns[i->iter])
break;
- prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
printbuf_indent_add(&i->buf, 2);
mutex_lock(&s->lock);
- prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
- prt_newline(&i->buf);
-
- prt_printf(&i->buf, "Transaction duration:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+ prt_printf(&i->buf, "Transaction duration:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->duration);
printbuf_indent_sub(&i->buf, 2);
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Lock hold times:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
@@ -772,8 +784,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
}
if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
printbuf_indent_add(&i->buf, 2);
prt_str_indented(&i->buf, s->max_paths_text);
@@ -800,50 +811,55 @@ static const struct file_operations btree_transaction_stats_op = {
.read = btree_transaction_stats_read,
};
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
struct btree_trans *trans;
- ssize_t ret = 0;
- u32 seq;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (i->iter)
- goto out;
+ ulong iter = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto out;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
- bch2_check_for_deadlock(trans, &i->buf);
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
- i->iter = task->pid;
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
closure_put(&trans->ref);
+ if (found)
+ return;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
seqmutex_unlock(&c->btree_trans_lock);
-out:
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
@@ -866,6 +882,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +918,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +935,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 4ae1e9f002a0..600eee936f13 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -15,6 +15,9 @@
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
+ if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
+ return 0;
+
unsigned bkey_u64s = bkey_val_u64s(d.k);
unsigned bkey_bytes = bkey_u64s * sizeof(u64);
u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
@@ -97,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.is_visible = dirent_is_visible,
};
-int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
int ret = 0;
- bkey_fsck_err_on(!d_name.len, c, err,
- dirent_empty_name,
+ bkey_fsck_err_on(!d_name.len,
+ c, dirent_empty_name,
"empty name");
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
- dirent_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ c, dirent_val_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
@@ -118,45 +120,47 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
- dirent_name_too_long,
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
+ c, dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
- bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
- dirent_name_embedded_nul,
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
+ c, dirent_name_embedded_nul,
"dirent has stray data after name's NUL");
bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
- (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
- dirent_name_dot_or_dotdot,
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
+ c, dirent_name_dot_or_dotdot,
"invalid name");
- bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
- dirent_name_has_slash,
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
+ c, dirent_name_has_slash,
"name with /");
bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
- dirent_to_itself,
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode,
+ c, dirent_to_itself,
"dirent points to own directory");
fsck_err:
return ret;
}
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> %llu type %s",
- d_name.len,
- d_name.name,
- d.v->d_type != DT_SUBVOL
- ? le64_to_cpu(d.v->d_inum)
- : le32_to_cpu(d.v->d_child_subvol),
- bch2_d_type_str(d.v->d_type));
+ prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+ if (d.v->d_type != DT_SUBVOL)
+ prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ else
+ prt_printf(out, "%u -> %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ le32_to_cpu(d.v->d_child_subvol));
+
+ prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -199,17 +203,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u64 dir, u32 snapshot,
+ u32 dir_subvol, u64 dir, u32 snapshot,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
- subvol_inum zero_inum = { 0 };
+ subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -217,10 +221,9 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
- ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- zero_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, snapshot, &dirent->k_i,
+ flags|BTREE_UPDATE_internal_snapshot_node);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -230,7 +233,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -241,19 +244,12 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, str_hash_flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
}
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
@@ -270,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
} else {
target->subvol = le32_to_cpu(d.v->d_child_subvol);
- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+ ret = bch2_subvolume_get(trans, target->subvol, true, &s);
target->inum = le64_to_cpu(s.inode);
}
@@ -291,23 +287,17 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ unsigned src_update_flags = 0;
+ bool delete_src, delete_dst;
int ret = 0;
- if (src_dir.subvol != dst_dir.subvol)
- return -EXDEV;
-
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
- ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_src = bch2_btree_iter_peek_slot(&src_iter);
+ old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
goto out;
@@ -317,12 +307,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
- if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
- return -EOPNOTSUPP;
-
-
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@@ -335,13 +319,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
} else {
- ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
goto out;
@@ -350,11 +330,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
-
- dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
- if (dst_type == DT_SUBVOL)
- return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@@ -424,28 +399,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
+ if (new_dst->v.d_type == DT_SUBVOL)
+ new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
-
/*
- * If we're deleting a subvolume, we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot:
+ * If we're deleting a subvolume we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot - there can only be
+ * single dirent that points to a given subvolume.
+ *
+ * IOW, we don't maintain multiple versions in different snapshots of
+ * dirents that point to subvolumes - dirents that point to subvolumes
+ * are only visible in one particular subvolume so it's not necessary,
+ * and it would be particularly confusing for fsck to have to deal with.
*/
- if (src_type == DT_SUBVOL) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter);
+ delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+ new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+ delete_dst = old_dst.k &&
+ bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+ new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+ if (!delete_src || !bkey_deleted(&new_src->k)) {
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
+ }
- new_src->k.p = src_iter.pos;
- src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ if (delete_src) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto out;
}
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
+ if (delete_dst) {
+ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto out;
+ }
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@@ -456,41 +458,25 @@ out:
return ret;
}
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- return ret;
-
- ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- if (ret)
- return ret;
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ int ret = bkey_err(k);
if (ret)
goto err;
- d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, inum);
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
-
return ret;
}
@@ -502,23 +488,26 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
- __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
- ret = -ENOTEMPTY;
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+ continue;
+ ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -531,73 +520,55 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+ bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
+}
+
+static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
+{
+ struct qstr name = bch2_dirent_get_name(d);
+ /*
+ * Although not required by the kernel code, updating ctx->pos is needed
+ * for the bcachefs FUSE driver. Without this update, the FUSE
+ * implementation will be stuck in an infinite loop when reading
+ * directories (via the bcachefs_fuse_readdir callback).
+ * In kernel space, ctx->pos is updated by the VFS code.
+ */
+ ctx->pos = d.k->p.offset;
+ bool ret = dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(d.v->d_type));
+ if (ret)
+ ctx->pos = d.k->p.offset + 1;
+ return ret;
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
- subvol_inum target;
- u32 snapshot;
struct bkey_buf sk;
- struct qstr name;
- int ret;
-
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot),
- POS(inum.inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
- dirent = bkey_s_c_to_dirent(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
+ POS(inum.inum, ctx->pos),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
- ret = bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- dirent = bkey_i_to_s_c_dirent(sk.k);
- bch2_trans_unlock(trans);
+ subvol_inum target;
+ int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret2 > 0)
+ continue;
- name = bch2_dirent_get_name(dirent);
+ ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ })));
- ctx->pos = dirent.k->p.offset;
- if (!dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(dirent.v->d_type)))
- break;
- ctx->pos = dirent.k->p.offset + 1;
-
- /*
- * read_target looks up subvolumes, we can overflow paths if the
- * directory has many subvolumes in it
- */
- ret = btree_trans_too_many_iters(trans);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+ return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 21ffeb78f02e..362b3b2f2f2e 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -4,15 +4,14 @@
#include "str_hash.h"
-enum bkey_invalid_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
- .key_invalid = bch2_dirent_invalid, \
+ .key_validate = bch2_dirent_validate, \
.val_to_text = bch2_dirent_to_text, \
.min_val_size = 16, \
})
@@ -35,14 +34,21 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
-int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+static inline void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
{
@@ -62,14 +68,14 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
new file mode 100644
index 000000000000..b32e91ba8be8
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "btree_cache.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "compress.h"
+#include "disk_accounting.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+
+/*
+ * Notes on disk accounting:
+ *
+ * We have two parallel sets of counters to be concerned with, and both must be
+ * kept in sync.
+ *
+ * - Persistent/on disk accounting, stored in the accounting btree and updated
+ * via btree write buffer updates that treat new accounting keys as deltas to
+ * apply to existing values. But reading from a write buffer btree is
+ * expensive, so we also have
+ *
+ * - In memory accounting, where accounting is stored as an array of percpu
+ * counters, indexed by an eytzinger array of disk acounting keys/bpos (which
+ * are the same thing, excepting byte swabbing on big endian).
+ *
+ * Cheap to read, but non persistent.
+ *
+ * Disk accounting updates are generated by transactional triggers; these run as
+ * keys enter and leave the btree, and can compare old and new versions of keys;
+ * the output of these triggers are deltas to the various counters.
+ *
+ * Disk accounting updates are done as btree write buffer updates, where the
+ * counters in the disk accounting key are deltas that will be applied to the
+ * counter in the btree when the key is flushed by the write buffer (or journal
+ * replay).
+ *
+ * To do a disk accounting update:
+ * - initialize a disk_accounting_pos, to specify which counter is being update
+ * - initialize counter deltas, as an array of 1-3 s64s
+ * - call bch2_disk_accounting_mod()
+ *
+ * This queues up the accounting update to be done at transaction commit time.
+ * Underneath, it's a normal btree write buffer update.
+ *
+ * The transaction commit path is responsible for propagating updates to the in
+ * memory counters, with bch2_accounting_mem_mod().
+ *
+ * The commit path also assigns every disk accounting update a unique version
+ * number, based on the journal sequence number and offset within that journal
+ * buffer; this is used by journal replay to determine which updates have been
+ * done.
+ *
+ * The transaction commit path also ensures that replicas entry accounting
+ * updates are properly marked in the superblock (so that we know whether we can
+ * mount without data being unavailable); it will update the superblock if
+ * bch2_accounting_mem_mod() tells it to.
+ */
+
+static const char * const disk_accounting_type_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ NULL
+};
+
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
+ s64 *d, unsigned nr)
+{
+ struct bkey_i_accounting *acc = bkey_accounting_init(k);
+
+ acc->k.p = disk_accounting_pos_to_bpos(pos);
+ set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
+
+ memcpy_u64s_small(acc->v.d, d, nr);
+}
+
+static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
+
+int bch2_disk_accounting_mod(struct btree_trans *trans,
+ struct disk_accounting_pos *k,
+ s64 *d, unsigned nr, bool gc)
+{
+ /* Normalize: */
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
+ break;
+ }
+
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, k, d, nr);
+
+ if (unlikely(gc)) {
+ int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
+ bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ return ret;
+ } else {
+ return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
+ }
+}
+
+int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
+ unsigned dev, s64 sectors,
+ bool gc)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ bch2_replicas_entry_cached(&acc.replicas, dev);
+
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+}
+
+static inline bool is_zero(char *start, char *end)
+{
+ BUG_ON(start > end);
+
+ for (; start < end; start++)
+ if (*start)
+ return false;
+ return true;
+}
+
+#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
+
+int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+ void *end = &acc_k + 1;
+ int ret = 0;
+
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
+ bversion_zero(k.k->bversion),
+ c, accounting_key_version_0,
+ "accounting key with version=0");
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ end = field_end(acc_k, nr_inodes);
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ end = field_end(acc_k, persistent_reserved);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bkey_fsck_err_on(!acc_k.replicas.nr_devs,
+ c, accounting_key_replicas_nr_devs_0,
+ "accounting key replicas entry with nr_devs=0");
+
+ bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
+ (acc_k.replicas.nr_required > 1 &&
+ acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
+ c, accounting_key_replicas_nr_required_bad,
+ "accounting key replicas entry with bad nr_required");
+
+ for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
+ bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
+ c, accounting_key_replicas_devs_unsorted,
+ "accounting key replicas entry with unsorted devs");
+
+ end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ end = field_end(acc_k, dev_data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ end = field_end(acc_k, compression);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ end = field_end(acc_k, snapshot);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ end = field_end(acc_k, btree);
+ break;
+ case BCH_DISK_ACCOUNTING_rebalance_work:
+ end = field_end(acc_k, rebalance_work);
+ break;
+ }
+
+ bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
+ c, accounting_key_junk_at_end,
+ "junk at end of accounting key");
+fsck_err:
+ return ret;
+}
+
+void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
+{
+ if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
+ prt_printf(out, "unknown type %u", k->type);
+ return;
+ }
+
+ prt_str(out, disk_accounting_type_strs[k->type]);
+ prt_str(out, " ");
+
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bch2_replicas_entry_to_text(out, &k->replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
+ bch2_prt_data_type(out, k->dev_data_type.data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ bch2_prt_compression_type(out, k->compression.type);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ prt_printf(out, "id=%u", k->snapshot.id);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, k->btree.id);
+ break;
+ }
+}
+
+void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ bch2_accounting_key_to_text(out, &acc_k);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
+ prt_printf(out, " %lli", acc.v->d[i]);
+}
+
+void bch2_accounting_swab(struct bkey_s k)
+{
+ for (u64 *p = (u64 *) k.v;
+ p < (u64 *) bkey_val_end(k);
+ p++)
+ *p = swab64(*p);
+}
+
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+ struct disk_accounting_pos *acc)
+{
+ unsafe_memcpy(r, &acc->replicas,
+ replicas_entry_bytes(&acc->replicas),
+ "variable length struct");
+}
+
+static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, p);
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ __accounting_to_replicas(r, &acc_k);
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
+{
+ struct bch_replicas_padded r;
+ return accounting_to_replicas(&r.e, p)
+ ? bch2_mark_replicas(c, &r.e)
+ : 0;
+}
+
+/*
+ * Ensure accounting keys being updated are present in the superblock, when
+ * applicable (i.e. replicas updates)
+ */
+int bch2_accounting_update_sb(struct btree_trans *trans)
+{
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
+ int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ /* raced with another insert, already present: */
+ if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p) < acc->k.nr)
+ return 0;
+
+ struct accounting_mem_entry n = {
+ .pos = a.k->p,
+ .bversion = a.k->bversion,
+ .nr_counters = bch2_accounting_counters(a.k),
+ .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL),
+ };
+
+ if (!n.v[0])
+ goto err;
+
+ if (acc->gc_running) {
+ n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!n.v[1])
+ goto err;
+ }
+
+ if (darray_push(&acc->k, n))
+ goto err;
+
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+
+ if (trace_accounting_mem_insert_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_accounting_to_text(&buf, c, a.s_c);
+ trace_accounting_mem_insert(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ return 0;
+err:
+ free_percpu(n.v[1]);
+ free_percpu(n.v[0]);
+ return -BCH_ERR_ENOMEM_disk_accounting;
+}
+
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ struct bch_replicas_padded r;
+
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
+ !bch2_replicas_marked_locked(c, &r.e))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ percpu_up_read(&c->mark_lock);
+ percpu_down_write(&c->mark_lock);
+ int ret = __bch2_accounting_mem_insert(c, a);
+ percpu_up_write(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+ return ret;
+}
+
+static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
+{
+ for (unsigned i = 0; i < e->nr_counters; i++)
+ if (percpu_u64_get(e->v[0] + i) ||
+ (e->v[1] &&
+ percpu_u64_get(e->v[1] + i)))
+ return false;
+ return true;
+}
+
+void bch2_accounting_mem_gc(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ percpu_down_write(&c->mark_lock);
+ struct accounting_mem_entry *dst = acc->k.data;
+
+ darray_for_each(acc->k, src) {
+ if (accounting_mem_entry_is_zero(src)) {
+ free_percpu(src->v[0]);
+ free_percpu(src->v[1]);
+ } else {
+ *dst++ = *src;
+ }
+ }
+
+ acc->k.nr = dst - acc->k.data;
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ percpu_up_write(&c->mark_lock);
+}
+
+/*
+ * Read out accounting keys for replicas entries, as an array of
+ * bch_replicas_usage entries.
+ *
+ * Note: this may be deprecated/removed at smoe point in the future and replaced
+ * with something more general, it exists to support the ioctl used by the
+ * 'bcachefs fs usage' command.
+ */
+int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(usage);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct {
+ struct bch_replicas_usage r;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+ } u;
+
+ if (!accounting_to_replicas(&u.r.r, i->pos))
+ continue;
+
+ u64 sectors;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
+ u.r.sectors = sectors;
+
+ ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
+ if (ret)
+ break;
+
+ memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
+ usage->nr += replicas_usage_bytes(&u.r);
+ }
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(usage);
+ return ret;
+}
+
+int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
+{
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(out_buf);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct disk_accounting_pos a_p;
+ bpos_to_disk_accounting_pos(&a_p, i->pos);
+
+ if (!(accounting_types_mask & BIT(a_p.type)))
+ continue;
+
+ ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
+ sizeof(u64) * i->nr_counters);
+ if (ret)
+ break;
+
+ struct bkey_i_accounting *a_out =
+ bkey_accounting_init((void *) &darray_top(*out_buf));
+ set_bkey_val_u64s(&a_out->k, i->nr_counters);
+ a_out->k.p = i->pos;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data,
+ a_out->v.d, i->nr_counters, false);
+
+ if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
+ out_buf->nr += bkey_bytes(&a_out->k);
+ }
+
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(out_buf);
+ return ret;
+}
+
+static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
+{
+ darray_for_each(acc->k, e) {
+ free_percpu(e->v[gc]);
+ e->v[gc] = NULL;
+ }
+}
+
+int bch2_gc_accounting_start(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e) {
+ e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!e->v[1]) {
+ bch2_accounting_free_counters(acc, true);
+ ret = -BCH_ERR_ENOMEM_disk_accounting;
+ break;
+ }
+ }
+
+ acc->gc_running = !ret;
+ percpu_up_write(&c->mark_lock);
+
+ return ret;
+}
+
+int bch2_gc_accounting_done(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct bpos pos = POS_MIN;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ while (1) {
+ unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &pos);
+
+ if (idx >= acc->k.nr)
+ break;
+
+ struct accounting_mem_entry *e = acc->k.data + idx;
+ pos = bpos_successor(e->pos);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, e->pos);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ continue;
+
+ u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
+ u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
+
+ unsigned nr = e->nr_counters;
+ bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
+ bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
+
+ if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
+ printbuf_reset(&buf);
+ prt_str(&buf, "accounting mismatch for ");
+ bch2_accounting_key_to_text(&buf, &acc_k);
+
+ prt_str(&buf, ": got");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", dst_v[j]);
+
+ prt_str(&buf, " should be");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", src_v[j]);
+
+ for (unsigned j = 0; j < nr; j++)
+ src_v[j] -= dst_v[j];
+
+ if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
+ percpu_up_write(&c->mark_lock);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
+ percpu_down_write(&c->mark_lock);
+ if (ret)
+ goto err;
+
+ if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, &acc_k, src_v, nr);
+ bch2_accounting_mem_mod_locked(trans,
+ bkey_i_to_s_c_accounting(&k_i.k),
+ BCH_ACCOUNTING_normal);
+
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
+ preempt_enable();
+ }
+ }
+ }
+ }
+err:
+fsck_err:
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+
+ if (k.k->type != KEY_TYPE_accounting)
+ return 0;
+
+ percpu_down_read(&c->mark_lock);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
+ BCH_ACCOUNTING_read);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+ struct disk_accounting_pos acc,
+ u64 *v, unsigned nr)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, invalid_dev = -1;
+
+ switch (acc.type) {
+ case BCH_DISK_ACCOUNTING_replicas: {
+ struct bch_replicas_padded r;
+ __accounting_to_replicas(&r.e, &acc);
+
+ for (unsigned i = 0; i < r.e.nr_devs; i++)
+ if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r.e.devs[i])) {
+ invalid_dev = r.e.devs[i];
+ goto invalid_device;
+ }
+
+ /*
+ * All replicas entry checks except for invalid device are done
+ * in bch2_accounting_validate
+ */
+ BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_write(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_write(&c->mark_lock);
+ }
+ break;
+ }
+
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+ invalid_dev = acc.dev_data_type.dev;
+ goto invalid_device;
+ }
+ break;
+ }
+
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+invalid_device:
+ if (fsck_err(trans, accounting_to_invalid_device,
+ "accounting entry points to invalid device %i\n %s",
+ invalid_dev,
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+ -BCH_ERR_remove_disk_accounting_entry;
+ } else {
+ ret = -BCH_ERR_remove_disk_accounting_entry;
+ }
+ goto fsck_err;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+
+ /*
+ * We might run more than once if we rewind to start topology repair or
+ * btree node scan - and those might cause us to get different results,
+ * so we can't just skip if we've already run.
+ *
+ * Instead, zero out any accounting we have:
+ */
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e)
+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+ for_each_member_device(c, ca)
+ percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+ percpu_memset(c->usage, 0, sizeof(*c->usage));
+ percpu_up_write(&c->mark_lock);
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ int ret = for_each_btree_key_continue(trans, iter,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
+
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
+
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ continue;
+ }
+
+ accounting_read_key(trans, k);
+ }));
+ if (ret)
+ goto err;
+
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *dst = keys->data;
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ if (i->k->k.type == KEY_TYPE_accounting) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+
+ if (!bch2_accounting_is_mem(acc_k))
+ continue;
+
+ struct bkey_s_c k = bkey_i_to_s_c(i->k);
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
+ sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &k.k->p);
+
+ bool applied = idx < acc->k.nr &&
+ bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
+
+ if (applied)
+ continue;
+
+ if (i + 1 < &darray_top(*keys) &&
+ i[1].k->k.type == KEY_TYPE_accounting &&
+ !journal_key_cmp(i, i + 1)) {
+ WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
+
+ i[1].journal_seq = i[0].journal_seq;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
+ bkey_s_c_to_accounting(k));
+ continue;
+ }
+
+ ret = accounting_read_key(trans, k);
+ if (ret)
+ goto err;
+ }
+
+ *dst++ = *i;
+ }
+ keys->gap = keys->nr = dst - keys->data;
+
+ percpu_down_write(&c->mark_lock);
+
+ darray_for_each_reverse(acc->k, i) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, i->pos);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ memset(v, 0, sizeof(v));
+
+ for (unsigned j = 0; j < i->nr_counters; j++)
+ v[j] = percpu_u64_get(i->v[0] + j);
+
+ /*
+ * If the entry counters are zeroed, it should be treated as
+ * nonexistent - it might point to an invalid device.
+ *
+ * Remove it, so that if it's re-added it gets re-marked in the
+ * superblock:
+ */
+ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+ ? -BCH_ERR_remove_disk_accounting_entry
+ : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
+
+ if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+ free_percpu(i->v[0]);
+ free_percpu(i->v[1]);
+ darray_remove_item(&acc->k, i);
+ ret = 0;
+ continue;
+ }
+
+ if (ret)
+ goto fsck_err;
+ }
+
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+
+ preempt_disable();
+ struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ switch (k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
+ if (ca) {
+ struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
+ percpu_u64_set(&d->buckets, v[0]);
+ percpu_u64_set(&d->sectors, v[1]);
+ percpu_u64_set(&d->fragmented, v[2]);
+
+ if (k.dev_data_type.data_type == BCH_DATA_sb ||
+ k.dev_data_type.data_type == BCH_DATA_journal)
+ usage->hidden += v[0] * ca->mi.bucket_size;
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+ preempt_enable();
+fsck_err:
+ percpu_up_write(&c->mark_lock);
+err:
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
+{
+ return bch2_trans_run(c,
+ bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
+ struct disk_accounting_pos acc;
+ bpos_to_disk_accounting_pos(&acc, k.k->p);
+
+ acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
+ acc.dev_data_type.dev == dev
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
+ : 0;
+ })) ?:
+ bch2_btree_write_buffer_flush_sync(trans));
+}
+
+int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
+{
+ struct bch_fs *c = ca->fs;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
+
+ int ret = bch2_trans_do(c, ({
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
+ }));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_verify_accounting_clean(struct bch_fs *c)
+{
+ bool mismatch = false;
+ struct bch_fs_usage_base base = {}, base_inmem = {};
+
+ bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, ({
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
+ unsigned nr = bch2_accounting_counters(k.k);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
+
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ continue;
+ }
+
+ bch2_accounting_mem_read(c, k.k->p, v, nr);
+
+ if (memcmp(a.v->d, v, nr * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " !=");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (!ca) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
+ rcu_read_unlock();
+
+ if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " in mem");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("dev accounting mismatch: %s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+ }
+ }
+
+ 0;
+ })));
+
+ acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
+
+#define check(x) \
+ if (base.x != base_inmem.x) { \
+ pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
+ mismatch = true; \
+ }
+
+ //check(hidden);
+ check(btree);
+ check(data);
+ check(cached);
+ check(reserved);
+ check(nr_inodes);
+
+ WARN_ON(mismatch);
+}
+
+void bch2_accounting_gc_free(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->mark_lock);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, true);
+ acc->gc_running = false;
+}
+
+void bch2_fs_accounting_exit(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, false);
+ darray_exit(&acc->k);
+}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
new file mode 100644
index 000000000000..f4372cafea2e
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.h
@@ -0,0 +1,275 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_H
+#define _BCACHEFS_DISK_ACCOUNTING_H
+
+#include "btree_update.h"
+#include "eytzinger.h"
+#include "sb-members.h"
+
+static inline void bch2_u64s_neg(u64 *v, unsigned nr)
+{
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+}
+
+static inline unsigned bch2_accounting_counters(const struct bkey *k)
+{
+ return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
+}
+
+static inline void bch2_accounting_neg(struct bkey_s_accounting a)
+{
+ bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
+}
+
+static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
+{
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ if (a.v->d[i])
+ return false;
+ return true;
+}
+
+static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
+ struct bkey_s_c_accounting src)
+{
+ EBUG_ON(dst->k.u64s != src.k->u64s);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+ dst->v.d[i] += src.v->d[i];
+ if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
+ dst->k.bversion = src.k->bversion;
+}
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
+static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
+{
+ BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ acc->_pad = p;
+#else
+ memcpy_swab(acc, &p, sizeof(p));
+#endif
+}
+
+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
+{
+ struct bpos p;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ p = acc->_pad;
+#else
+ memcpy_swab(&p, acc, sizeof(p));
+#endif
+ return p;
+}
+
+int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
+ s64 *, unsigned, bool);
+int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
+
+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
+void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_accounting_swab(struct bkey_s);
+
+#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
+ .key_validate = bch2_accounting_validate, \
+ .val_to_text = bch2_accounting_to_text, \
+ .swab = bch2_accounting_swab, \
+ .min_val_size = 8, \
+})
+
+int bch2_accounting_update_sb(struct btree_trans *);
+
+static inline int accounting_pos_cmp(const void *_l, const void *_r)
+{
+ const struct bpos *l = _l, *r = _r;
+
+ return bpos_cmp(*l, *r);
+}
+
+enum bch_accounting_mode {
+ BCH_ACCOUNTING_normal,
+ BCH_ACCOUNTING_gc,
+ BCH_ACCOUNTING_read,
+};
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
+void bch2_accounting_mem_gc(struct bch_fs *);
+
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+{
+ return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
+ acc.type != BCH_DISK_ACCOUNTING_inum;
+}
+
+/*
+ * Update in memory counters so they match the btree update we're doing; called
+ * from transaction commit path
+ */
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
+ struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ bool gc = mode == BCH_ACCOUNTING_gc;
+
+ if (gc && !acc->gc_running)
+ return 0;
+
+ if (!bch2_accounting_is_mem(acc_k))
+ return 0;
+
+ if (mode == BCH_ACCOUNTING_normal) {
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (ca) {
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+
+ unsigned idx;
+
+ while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
+ int ret = bch2_accounting_mem_insert(c, a, mode);
+ if (ret)
+ return ret;
+ }
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ this_cpu_add(e->v[gc][i], a.v->d[i]);
+ return 0;
+}
+
+static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
+{
+ percpu_down_read(&trans->c->mark_lock);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
+ percpu_up_read(&trans->c->mark_lock);
+ return ret;
+}
+
+static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
+ unsigned idx, u64 *v, unsigned nr, bool gc)
+{
+ memset(v, 0, sizeof(*v) * nr);
+
+ if (unlikely(idx >= acc->k.nr))
+ return;
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ nr = min_t(unsigned, nr, e->nr_counters);
+
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = percpu_u64_get(e->v[gc] + i);
+}
+
+static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
+ u64 *v, unsigned nr)
+{
+ percpu_down_read(&c->mark_lock);
+ struct bch_accounting_mem *acc = &c->accounting;
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p);
+
+ bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+ percpu_up_read(&c->mark_lock);
+}
+
+static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+ EBUG_ON(!res->ref);
+
+ return (struct bversion) {
+ .hi = res->seq >> 32,
+ .lo = (res->seq << 32) | (res->offset + offset),
+ };
+}
+
+static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
+ struct bkey_i_accounting *a,
+ unsigned commit_flags)
+{
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) a - (u64 *) trans->journal_entries);
+
+ EBUG_ON(bversion_zero(a->k.bversion));
+
+ return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+ : 0;
+}
+
+static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
+ struct bkey_i_accounting *a_i,
+ unsigned commit_flags)
+{
+ if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ struct bkey_s_accounting a = accounting_i_to_s(a_i);
+
+ bch2_accounting_neg(a);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+ bch2_accounting_neg(a);
+ }
+}
+
+int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
+int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
+
+int bch2_gc_accounting_start(struct bch_fs *);
+int bch2_gc_accounting_done(struct bch_fs *);
+
+int bch2_accounting_read(struct bch_fs *);
+
+int bch2_dev_usage_remove(struct bch_fs *, unsigned);
+int bch2_dev_usage_init(struct bch_dev *, bool);
+
+void bch2_verify_accounting_clean(struct bch_fs *c);
+
+void bch2_accounting_gc_free(struct bch_fs *);
+void bch2_fs_accounting_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
new file mode 100644
index 000000000000..7b6e6c97e6aa
--- /dev/null
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+
+#include "replicas_format.h"
+
+/*
+ * Disk accounting - KEY_TYPE_accounting - on disk format:
+ *
+ * Here, the key has considerably more structure than a typical key (bpos); an
+ * accounting key is 'struct disk_accounting_pos', which is a union of bpos.
+ *
+ * More specifically: a key is just a muliword integer (where word endianness
+ * matches native byte order), so we're treating bpos as an opaque 20 byte
+ * integer and mapping bch_accounting_key to that.
+ *
+ * This is a type-tagged union of all our various subtypes; a disk accounting
+ * key can be device counters, replicas counters, et cetera - it's extensible.
+ *
+ * The value is a list of u64s or s64s; the number of counters is specific to a
+ * given accounting type.
+ *
+ * Unlike with other key types, updates are _deltas_, and the deltas are not
+ * resolved until the update to the underlying btree, done by btree write buffer
+ * flush or journal replay.
+ *
+ * Journal replay in particular requires special handling. The journal tracks a
+ * range of entries which may possibly have not yet been applied to the btree
+ * yet - it does not know definitively whether individual entries are dirty and
+ * still need to be applied.
+ *
+ * To handle this, we use the version field of struct bkey, and give every
+ * accounting update a unique version number - a total ordering in time; the
+ * version number is derived from the key's position in the journal. Then
+ * journal replay can compare the version number of the key from the journal
+ * with the version number of the key in the btree to determine if a key needs
+ * to be replayed.
+ *
+ * For this to work, we must maintain this strict time ordering of updates as
+ * they are flushed to the btree, both via write buffer flush and via journal
+ * replay. This has complications for the write buffer code while journal replay
+ * is still in progress; the write buffer cannot flush any accounting keys to
+ * the btree until journal replay has finished replaying its accounting keys, or
+ * the (newer) version number of the keys from the write buffer will cause
+ * updates from journal replay to be lost.
+ */
+
+struct bch_accounting {
+ struct bch_val v;
+ __u64 d[];
+};
+
+#define BCH_ACCOUNTING_MAX_COUNTERS 3
+
+#define BCH_DATA_TYPES() \
+ x(free, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5) \
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9) \
+ x(unstriped, 10)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_DISK_ACCOUNTING_TYPES() \
+ x(nr_inodes, 0) \
+ x(persistent_reserved, 1) \
+ x(replicas, 2) \
+ x(dev_data_type, 3) \
+ x(compression, 4) \
+ x(snapshot, 5) \
+ x(btree, 6) \
+ x(rebalance_work, 7) \
+ x(inum, 8)
+
+enum disk_accounting_type {
+#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ BCH_DISK_ACCOUNTING_TYPE_NR,
+};
+
+struct bch_nr_inodes {
+};
+
+struct bch_persistent_reserved {
+ __u8 nr_replicas;
+};
+
+struct bch_dev_data_type {
+ __u8 dev;
+ __u8 data_type;
+};
+
+struct bch_acct_compression {
+ __u8 type;
+};
+
+struct bch_acct_snapshot {
+ __u32 id;
+} __packed;
+
+struct bch_acct_btree {
+ __u32 id;
+} __packed;
+
+struct bch_acct_inum {
+ __u64 inum;
+} __packed;
+
+struct bch_acct_rebalance_work {
+};
+
+struct disk_accounting_pos {
+ union {
+ struct {
+ __u8 type;
+ union {
+ struct bch_nr_inodes nr_inodes;
+ struct bch_persistent_reserved persistent_reserved;
+ struct bch_replicas_entry_v1 replicas;
+ struct bch_dev_data_type dev_data_type;
+ struct bch_acct_compression compression;
+ struct bch_acct_snapshot snapshot;
+ struct bch_acct_btree btree;
+ struct bch_acct_rebalance_work rebalance_work;
+ struct bch_acct_inum inum;
+ } __packed;
+ } __packed;
+ struct bpos _pad;
+ };
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
new file mode 100644
index 000000000000..b1982131b206
--- /dev/null
+++ b/fs/bcachefs/disk_accounting_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+
+#include "darray.h"
+
+struct accounting_mem_entry {
+ struct bpos pos;
+ struct bversion bversion;
+ unsigned nr_counters;
+ u64 __percpu *v[2];
+};
+
+struct bch_accounting_mem {
+ DARRAY(struct accounting_mem_entry) k;
+ bool gc_running;
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 06a7df529b40..5df8de0b8c02 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
@@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
struct bch_disk_group_cpu *dst;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
continue;
g = BCH_MEMBER_GROUP(&m);
@@ -512,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
if (!c)
- return 0;
+ return -BCH_ERR_option_needs_open_fs;
if (!strlen(val) || !strcmp(val, "none")) {
*res = 0;
@@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
ca = bch2_dev_lookup(c, val);
if (!IS_ERR(ca)) {
*res = dev_to_target(ca->dev_idx);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return 0;
}
@@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi
case TARGET_DEV: {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
- if (bch2_dev_exists(sb, t.dev)) {
+ if (bch2_member_exists(sb, t.dev)) {
prt_printf(out, "Device ");
pr_uuid(out, m.uuid.b);
prt_printf(out, " (%u)", t.dev);
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
new file mode 100644
index 000000000000..698990bbf1d2
--- /dev/null
+++ b/fs/bcachefs/disk_groups_format.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
+#define _BCACHEFS_DISK_GROUPS_FORMAT_H
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d503af270024..d2a5e76e6479 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -13,10 +13,12 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
+#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
@@ -24,6 +26,7 @@
#include "util.h"
#include <linux/sort.h>
+#include <linux/string_choices.h>
#ifdef __KERNEL__
@@ -106,24 +109,28 @@ struct ec_bio {
/* Stripes btree keys: */
-int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
int ret = 0;
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
- bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
- stripe_pos_bad,
+ bpos_gt(k.k->p, POS(0, U32_MAX)),
+ c, stripe_pos_bad,
"stripe at bad pos");
- bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
- stripe_val_size_bad,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
+ c, stripe_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(k.k), stripe_val_u64s(s));
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+ c, stripe_csum_granularity_bad,
+ "invalid csum granularity (%u >= 64)",
+ s->csum_granularity_bits);
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -131,174 +138,251 @@ fsck_err:
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
- s->algorithm,
- le16_to_cpu(s->sectors),
- nr_data,
- s->nr_redundant,
- s->csum_type,
- 1U << s->csum_granularity_bits);
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
- for (i = 0; i < s->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = s->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
- prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
- if (i < nr_data)
- prt_printf(out, "#%u", stripe_blockcount_get(s, i));
- prt_printf(out, " gen %u", ptr->gen);
- if (ptr_stale(ca, ptr))
- prt_printf(out, " stale");
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_str(out, " gran ");
+ if (s.csum_granularity_bits < 64)
+ prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+ else
+ prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
+
+ if (s.disk_label) {
+ prt_str(out, " label");
+ bch2_disk_path_to_text(out, c, s.disk_label - 1);
+ }
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
+
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, ptr);
+
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
}
}
/* Triggers: */
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
+static int __mark_stripe_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ struct bpos bucket,
+ struct bch_alloc_v4 *a,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
+ struct bch_fs *c = trans->c;
if (deleting)
sectors = -sectors;
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors);
- if (ret)
- goto err;
-
if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
- ret = -EIO;
+ if (bch2_trans_inconsistent_on(a->stripe ||
+ a->stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->stripe, s.k->p.offset,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- s.k->p.offset)) {
- ret = -EIO;
+ if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
} else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
- ret = -EIO;
+ if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
+ a->stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->stripe,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
+ "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ bch2_data_type_str(data_type),
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ if (bch2_trans_inconsistent_on(parity &&
+ (a->dirty_sectors != -sectors ||
+ a->cached_sectors), trans,
+ "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
}
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
+ if (sectors) {
+ ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
+ a->gen, a->data_type, &a->dirty_sectors);
+ if (ret)
+ goto err;
+ }
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
+ if (!deleting) {
+ a->stripe = s.k->p.offset;
+ a->stripe_redundancy = s.v->nr_redundant;
+ alloc_data_type_set(a, data_type);
+ } else {
+ a->stripe = 0;
+ a->stripe_redundancy = 0;
+ alloc_data_type_set(a, BCH_DATA_user);
+ }
err:
- bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
struct printbuf buf = PRINTBUF;
int ret = 0;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- /* * XXX doesn't handle deletion */
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
-
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
+ if (unlikely(!ca)) {
+ if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
- bucket_lock(g);
- old = *g;
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors);
- if (ret)
- goto err;
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update(trans, bucket, 0);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ }
+
+ if (flags & BTREE_TRIGGER_gc) {
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ ptr->dev,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
- g->data_type = data_type;
- g->dirty_sectors += sectors;
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
+ alloc_to_bucket(g, new);
+ bucket_unlock(g);
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ }
err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
- percpu_up_read(&c->mark_lock);
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
+static int mark_stripe_buckets(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
+
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true, flags);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
+{
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->disk_label = s->disk_label;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+}
+
int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_s_c new = _new.s_c;
struct bch_fs *c = trans->c;
@@ -308,7 +392,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
* If the pointers aren't changing, we don't need to do anything:
*/
@@ -319,55 +411,68 @@ int bch2_trigger_stripe(struct btree_trans *trans,
new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
+ struct gc_stripe *gc = NULL;
+ if (flags & BTREE_TRIGGER_gc) {
+ gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+ if (!gc) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ *
+ * Also: when we bring back runtime gc, locking
+ */
+ gc->alive = true;
+ gc->sectors = le16_to_cpu(new_s->sectors);
+ gc->nr_blocks = new_s->nr_blocks;
+ gc->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ gc->ptrs[i] = new_s->ptrs[i];
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
+ }
if (new_s) {
- s64 sectors = le16_to_cpu(new_s->sectors);
+ s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, new);
- int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, new);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
+
+ if (gc)
+ memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
}
if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, old);
- int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, old);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
}
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
- if (ret)
- return ret;
- }
- }
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
}
- if (flags & BTREE_TRIGGER_ATOMIC) {
+ if (flags & BTREE_TRIGGER_atomic) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
@@ -390,14 +495,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
memset(m, 0, sizeof(*m));
} else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+ stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
@@ -406,54 +504,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (flags & BTREE_TRIGGER_GC) {
- struct gc_stripe *m =
- genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- */
- m->alive = true;
- m->sectors = le16_to_cpu(new_s->sectors);
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->ptrs[i] = new_s->ptrs[i];
-
- bch2_bkey_to_replicas(&m->r.e, new);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++) {
- int ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
-
- int ret = bch2_update_replicas(c, new, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- 0, true);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
return 0;
}
@@ -504,7 +554,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
- kvpfree(buf->data[i], buf->size << 9);
+ kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@@ -531,7 +581,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}
@@ -604,21 +654,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf err = PRINTBUF;
- struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
-
- prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
- want.hi, want.lo,
- got.hi, got.lo,
- bch2_csum_types[v->csum_type]);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
+ struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
+ if (ca) {
+ struct printbuf err = PRINTBUF;
- clear_bit(i, buf->valid);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ clear_bit(i, buf->valid);
break;
}
@@ -681,14 +731,16 @@ static void ec_block_endio(struct bio *bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"erasure coding %s error: %s",
- bio_data_dir(bio) ? "write" : "read",
+ str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(ca->fs,
- "error %s stripe: stale pointer after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ "error %s stripe: stale/invalid pointer (%i) after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to",
+ stale);
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
@@ -703,25 +755,28 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
unsigned offset = 0, bytes = buf->size << 9;
struct bch_extent_ptr *ptr = &v->ptrs[idx];
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
- if (ptr_stale(ca, ptr)) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ if (!ca) {
clear_bit(idx, buf->valid);
return;
}
- if (!bch2_dev_get_ioref(ca, rw)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer (%i)",
+ rw == READ ? "reading from" : "writing to",
+ stale);
clear_bit(idx, buf->valid);
return;
}
+
this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
while (offset < bytes) {
@@ -767,7 +822,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_SLOTS);
+ POS(0, idx), BTREE_ITER_slots);
ret = bkey_err(k);
if (ret)
goto err;
@@ -782,13 +837,16 @@ err:
}
/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf;
+ struct ec_stripe_buf *buf = NULL;
struct closure cl;
struct bch_stripe *v;
unsigned i, offset;
+ const char *msg = NULL;
+ struct printbuf msgbuf = PRINTBUF;
int ret = 0;
closure_init_stack(&cl);
@@ -801,32 +859,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: error %i looking up stripe", ret);
- kfree(buf);
- return -EIO;
+ msg = "stripe not found";
+ goto err;
}
v = &bkey_i_to_stripe(&buf->key)->v;
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: pointer doesn't match stripe");
- ret = -EIO;
+ msg = "pointer doesn't match stripe";
goto err;
}
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: read is bigger than stripe");
- ret = -EIO;
+ msg = "read is bigger than stripe";
goto err;
}
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
- if (ret)
+ if (ret) {
+ msg = "-ENOMEM";
goto err;
+ }
for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
@@ -834,9 +888,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- ret = -EIO;
+ msg = "unable to read enough blocks";
goto err;
}
@@ -848,10 +900,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-err:
+out:
ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
+err:
+ bch2_bkey_val_to_text(&msgbuf, c, orig_k);
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
+ printbuf_exit(&msgbuf);
+ ret = -BCH_ERR_stripe_reconstruct;
+ goto out;
}
/* stripe bucket accounting: */
@@ -866,8 +925,8 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
mutex_lock(&c->ec_stripes_heap_lock);
if (n.size > h->size) {
- memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
- n.used = h->used;
+ memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
+ n.nr = h->nr;
swap(*h, n);
}
mutex_unlock(&c->ec_stripes_heap_lock);
@@ -878,7 +937,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
- if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+ if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -958,7 +1017,7 @@ static u64 stripe_idx_to_delete(struct bch_fs *c)
lockdep_assert_held(&c->ec_stripes_heap_lock);
- if (h->used &&
+ if (h->nr &&
h->data[0].blocks_nonempty == 0 &&
!bch2_stripe_is_open(c, h->data[0].idx))
return h->data[0].idx;
@@ -966,14 +1025,6 @@ static u64 stripe_idx_to_delete(struct bch_fs *c)
return 0;
}
-static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
- struct ec_stripe_heap_entry l,
- struct ec_stripe_heap_entry r)
-{
- return ((l.blocks_nonempty > r.blocks_nonempty) -
- (l.blocks_nonempty < r.blocks_nonempty));
-}
-
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
size_t i)
{
@@ -982,12 +1033,40 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}
+static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+
+ return ((_l->blocks_nonempty > _r->blocks_nonempty) <
+ (_l->blocks_nonempty < _r->blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+ ec_stripes_heap *_h = (ec_stripes_heap *)h;
+ size_t i = _l - _h->data;
+ size_t j = _r - _h->data;
+
+ swap(*_l, *_r);
+
+ ec_stripes_heap_set_backpointer(_h, i);
+ ec_stripes_heap_set_backpointer(_h, j);
+}
+
+static const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+};
+
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m = genradix_ptr(&c->stripes, idx);
- BUG_ON(m->heap_idx >= h->used);
+ BUG_ON(m->heap_idx >= h->nr);
BUG_ON(h->data[m->heap_idx].idx != idx);
}
@@ -997,9 +1076,7 @@ void bch2_stripes_heap_del(struct bch_fs *c,
mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
- heap_del(&c->ec_stripes_heap, m->heap_idx,
- ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
mutex_unlock(&c->ec_stripes_heap_lock);
}
@@ -1007,14 +1084,15 @@ void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(heap_full(&c->ec_stripes_heap));
+ BUG_ON(min_heap_full(&c->ec_stripes_heap));
- heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+ genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
+ min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = m->blocks_nonempty,
}),
- ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ &callbacks,
+ &c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
mutex_unlock(&c->ec_stripes_heap_lock);
@@ -1033,10 +1111,8 @@ void bch2_stripes_heap_update(struct bch_fs *c,
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
i = m->heap_idx;
- heap_sift_up(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
- heap_sift_down(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
+ min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
+ min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
@@ -1058,7 +1134,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1100,7 +1176,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
bch_err_fn(c, ret);
if (ret)
@@ -1120,47 +1196,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
/* stripe creation: */
static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *new,
- bool create)
+ struct bkey_i_stripe *old,
+ struct bkey_i_stripe *new)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ bool create = !old;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
- bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type]);
+ if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+ c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type])) {
ret = -EINVAL;
goto err;
}
if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
- unsigned i;
+ const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
- if (old->nr_blocks != new->v.nr_blocks) {
- bch_err(c, "error updating stripe: nr_blocks does not match");
- ret = -EINVAL;
- goto err;
- }
+ BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+ BUG_ON(old->v.nr_blocks != v->nr_blocks);
+
+ for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+ unsigned sectors = stripe_blockcount_get(v, i);
- for (i = 0; i < new->v.nr_blocks; i++) {
- unsigned v = stripe_blockcount_get(old, i);
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+ struct printbuf buf = PRINTBUF;
- BUG_ON(v &&
- (old->ptrs[i].dev != new->v.ptrs[i].dev ||
- old->ptrs[i].gen != new->v.ptrs[i].gen ||
- old->ptrs[i].offset != new->v.ptrs[i].offset));
+ prt_printf(&buf, "stripe changed nonempty block %u", i);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If the stripe ptr changed underneath us, it must have
+ * been dev_remove_stripes() -> * invalidate_stripe_to_dev()
+ */
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+ BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+ if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+ new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+ }
- stripe_blockcount_set(&new->v, i, v);
+ stripe_blockcount_set(&new->v, i, sectors);
}
}
@@ -1171,48 +1262,42 @@ err:
}
static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket, u8 gen,
struct ec_stripe_buf *s,
- struct bpos *bp_pos)
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
{
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
struct bch_fs *c = trans->c;
- struct bch_backpointer bp;
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_ptr *ec_ptr = NULL;
struct bch_extent_stripe_ptr stripe_ptr;
struct bkey_i *n;
int ret, dev, block;
- ret = bch2_get_next_backpointer(trans, bucket, gen,
- bp_pos, &bp, BTREE_ITER_CACHED);
- if (ret)
- return ret;
- if (bpos_eq(*bp_pos, SPOS_MAX))
- return 0;
-
- if (bp.level) {
+ if (bp.v->level) {
struct printbuf buf = PRINTBUF;
struct btree_iter node_iter;
struct btree *b;
- b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+ b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
bch2_trans_iter_exit(trans, &node_iter);
if (!b)
return 0;
prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
}
- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+ k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1244,7 +1329,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
@@ -1270,25 +1355,37 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
{
struct bch_fs *c = trans->c;
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr bucket = v->ptrs[block];
- struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
- struct bpos bp_pos = POS_MIN;
+ struct bch_extent_ptr ptr = v->ptrs[block];
int ret = 0;
- while (1) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
- s, &bp_pos));
- if (ret)
- break;
- if (bkey_eq(bp_pos, POS_MAX))
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
+ if (!ca)
+ return -EIO;
+
+ struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket_pos),
+ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc, ({
+ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
break;
- bp_pos = bpos_nosnap_successor(bp_pos);
- }
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
+ bkey_s_c_to_backpointer(bp_k), &last_flushed);
+ }));
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch2_dev_put(ca);
return ret;
}
@@ -1319,20 +1416,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- int ret;
-
- if (!bch2_dev_get_ioref(ca, WRITE)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ if (!ca) {
s->err = -BCH_ERR_erofs_no_writes;
return;
}
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
memset(s->new_stripe.data[block] + (offset << 9),
0,
ob->sectors_free << 9);
- ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
ob->bucket * ca->mi.bucket_size + offset,
ob->sectors_free,
GFP_KERNEL, 0);
@@ -1414,12 +1509,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- bkey_i_to_stripe(&s->new_stripe.key),
- !s->have_existing_stripe));
+ ret = bch2_trans_commit_do(c, &s->res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
+ s->have_existing_stripe
+ ? bkey_i_to_stripe(&s->existing_stripe.key)
+ : NULL,
+ bkey_i_to_stripe(&s->new_stripe.key)));
bch_err_msg(c, ret, "creating stripe key");
if (ret) {
goto err;
@@ -1491,10 +1588,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
-static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s = h->s;
+ lockdep_assert_held(&h->lock);
+
BUG_ON(!s->allocated && !s->err);
h->s = NULL;
@@ -1507,6 +1606,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_new_put(c, s, STRIPE_REF_io);
}
+static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
+{
+ h->s->err = err;
+ ec_stripe_new_set_pending(c, h);
+}
+
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
struct ec_stripe_new *s = ob->ec;
@@ -1517,16 +1622,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct bch_dev *ca;
- unsigned offset;
-
if (!ob)
return NULL;
BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
- ca = bch_dev_bkey_exists(c, ob->dev);
- offset = ca->mi.bucket_size - ob->sectors_free;
+ struct bch_dev *ca = ob_dev(c, ob);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
@@ -1580,7 +1682,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
struct bkey_i *k,
unsigned nr_data,
unsigned nr_parity,
- unsigned stripe_size)
+ unsigned stripe_size,
+ unsigned disk_label)
{
struct bkey_i_stripe *s = bkey_stripe_init(k);
unsigned u64s;
@@ -1591,7 +1694,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
s->v.csum_type = BCH_CSUM_crc32c;
- s->v.pad = 0;
+ s->v.disk_label = disk_label;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
BUG_ON(1 << s->v.csum_granularity_bits >=
@@ -1603,7 +1706,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
set_bkey_val_u64s(&s->k, u64s);
}
-static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s;
@@ -1611,7 +1714,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s)
- return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ return NULL;
mutex_init(&s->lock);
closure_init(&s->iodone, NULL);
@@ -1624,40 +1727,29 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->nr_parity = h->redundancy;
ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity, h->blocksize);
-
- h->s = s;
- return 0;
+ s->nr_data, s->nr_parity,
+ h->blocksize, h->disk_label);
+ return s;
}
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
+static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->target = target;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
+ struct bch_devs_mask devs = h->devs;
rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, target);
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ unsigned nr_devs = dev_mask_nr(&h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
+ unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
h->blocksize = pick_blocksize(c, &h->devs);
+ h->nr_active_devs = 0;
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1668,9 +1760,50 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
- if (h->nr_active_devs < h->redundancy + 2)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
- h->nr_active_devs, h->redundancy + 2);
+ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
+
+ if (h->insufficient_devs) {
+ const char *err;
+
+ if (nr_devs < h->redundancy + 2)
+ err = NULL;
+ else if (nr_devs_with_durability < h->redundancy + 2)
+ err = "cannot use durability=0 devices";
+ else
+ err = "mismatched bucket sizes";
+
+ if (err)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
+ h->nr_active_devs, h->redundancy + 2, err);
+ }
+
+ struct bch_devs_mask devs_leaving;
+ bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
+
+ if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
+ ec_stripe_new_cancel(c, h, -EINTR);
+
+ h->rw_devs_change_count = c->rw_devs_change_count;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->disk_label = disk_label;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
list_add(&h->list, &c->ec_stripe_head_list);
return h;
@@ -1682,14 +1815,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data) == h->s->nr_data)
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_set_pending(c, h);
mutex_unlock(&h->lock);
}
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ unsigned disk_label,
unsigned algo,
unsigned redundancy,
enum bch_watermark watermark)
@@ -1707,63 +1840,84 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto found;
+ goto err;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->target == target &&
+ if (h->disk_label == disk_label &&
h->algo == algo &&
h->redundancy == redundancy &&
h->watermark == watermark) {
ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret)
+ if (ret) {
h = ERR_PTR(ret);
+ goto err;
+ }
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+ h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
+ if (!h) {
+ h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
+ goto err;
+ }
found:
- if (!IS_ERR_OR_NULL(h) &&
- h->nr_active_devs < h->redundancy + 2) {
+ if (h->rw_devs_change_count != c->rw_devs_change_count)
+ ec_stripe_head_devs_update(c, h);
+
+ if (h->insufficient_devs) {
mutex_unlock(&h->lock);
h = NULL;
}
+err:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
-static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct ec_stripe_head *h, struct ec_stripe_new *s,
enum bch_watermark watermark, struct closure *cl)
{
struct bch_fs *c = trans->c;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
- struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
int ret = 0;
- BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
- BUG_ON(v->nr_redundant != h->s->nr_parity);
+ BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
+ BUG_ON(v->nr_redundant != s->nr_parity);
+
+ /* * We bypass the sector allocator which normally does this: */
+ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
+ /*
+ * Note: we don't yet repair invalid blocks (failed/removed
+ * devices) when reusing stripes - we still need a codepath to
+ * walk backpointers and update all extents that point to that
+ * block when updating the stripe
+ */
+ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+ __clear_bit(v->ptrs[i].dev, devs.d);
- for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
- __clear_bit(v->ptrs[i].dev, devs.d);
- if (i < h->s->nr_data)
+ if (i < s->nr_data)
nr_have_data++;
else
nr_have_parity++;
}
- BUG_ON(nr_have_data > h->s->nr_data);
- BUG_ON(nr_have_parity > h->s->nr_parity);
+ BUG_ON(nr_have_data > s->nr_data);
+ BUG_ON(nr_have_parity > s->nr_parity);
buckets.nr = 0;
- if (nr_have_parity < h->s->nr_parity) {
+ if (nr_have_parity < s->nr_parity) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->parity_stripe,
&devs,
- h->s->nr_parity,
+ s->nr_parity,
&nr_have_parity,
&have_cache, 0,
BCH_DATA_parity,
@@ -1771,14 +1925,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data + h->s->nr_parity,
- h->s->nr_data);
- BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data + s->nr_parity,
+ s->nr_data);
+ BUG_ON(j >= s->nr_data + s->nr_parity);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -1786,11 +1940,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
}
buckets.nr = 0;
- if (nr_have_data < h->s->nr_data) {
+ if (nr_have_data < s->nr_data) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->block_stripe,
&devs,
- h->s->nr_data,
+ s->nr_data,
&nr_have_data,
&have_cache, 0,
BCH_DATA_user,
@@ -1798,13 +1952,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data, 0);
- BUG_ON(j >= h->s->nr_data);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data, 0);
+ BUG_ON(j >= s->nr_data);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -1814,7 +1968,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
return 0;
}
-/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
{
@@ -1828,7 +1981,7 @@ static s64 get_existing_stripe(struct bch_fs *c,
return -1;
mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@@ -1837,7 +1990,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
m = genradix_ptr(&c->stripes, stripe_idx);
- if (m->algorithm == head->algo &&
+ if (m->disk_label == head->disk_label &&
+ m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
@@ -1850,73 +2004,78 @@ static s64 get_existing_stripe(struct bch_fs *c,
return ret;
}
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
{
- struct bch_fs *c = trans->c;
- struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
- struct bch_stripe *existing_v;
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
unsigned i;
- s64 idx;
- int ret;
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
-
- ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
- if (ret) {
- bch2_stripe_close(c, h->s);
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
- return ret;
- }
-
- existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
-
- BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
- h->s->nr_data = existing_v->nr_blocks -
+ BUG_ON(existing_v->nr_redundant != s->nr_parity);
+ s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
- bch2_stripe_close(c, h->s);
+ bch2_stripe_close(c, s);
return ret;
}
- BUG_ON(h->s->existing_stripe.size != h->blocksize);
- BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
/*
* Free buckets we initially allocated - they might conflict with
* blocks from the stripe we're reusing:
*/
- for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
- h->s->blocks[i] = 0;
+ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
+ s->blocks[i] = 0;
}
- memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
- memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
+ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
- for (i = 0; i < existing_v->nr_blocks; i++) {
+ for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, h->s->blocks_gotten);
- __set_bit(i, h->s->blocks_allocated);
+ __set_bit(i, s->blocks_gotten);
+ __set_bit(i, s->blocks_allocated);
}
- ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
}
- bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
- h->s->have_existing_stripe = true;
+ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
+ s->have_existing_stripe = true;
return 0;
}
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
+{
+ struct bch_fs *c = trans->c;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
+ if (ret) {
+ bch2_stripe_close(c, s);
+ return ret;
+ }
+
+ return init_new_stripe_from_existing(c, s);
+}
+
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -1925,17 +2084,21 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- if (!h->s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &h->s->res,
+ if (!s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &s->res,
h->blocksize,
- h->s->nr_parity,
+ s->nr_parity,
BCH_DISK_RESERVATION_NOFAIL);
if (ret)
return ret;
}
+ /*
+ * Allocate stripe slot
+ * XXX: we're going to need a bitrange btree of free stripes
+ */
for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
@@ -1948,7 +2111,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
}
if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, h->s, k.k->p.offset))
+ bch2_try_open_stripe(c, s, k.k->p.offset))
break;
}
@@ -1959,16 +2122,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
ret = ec_stripe_mem_alloc(trans, &iter);
if (ret) {
- bch2_stripe_close(c, h->s);
+ bch2_stripe_close(c, s);
goto err;
}
- h->s->new_stripe.key.k.p = iter.pos;
+ s->new_stripe.key.k.p = iter.pos;
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
err:
- bch2_disk_reservation_put(c, &h->s->res);
+ bch2_disk_reservation_put(c, &s->res);
goto out;
}
@@ -1982,29 +2145,44 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
bool waiting = false;
+ unsigned disk_label = 0;
+ struct target t = target_decode(target);
int ret;
- h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (t.type == TARGET_GROUP) {
+ if (t.group > U8_MAX) {
+ bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
+ return NULL;
+ }
+ disk_label = t.group + 1; /* 0 == no label */
+ }
+
+ h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
if (IS_ERR_OR_NULL(h))
return h;
if (!h->s) {
- ret = ec_new_stripe_alloc(c, h);
- if (ret) {
+ h->s = ec_new_stripe_alloc(c, h);
+ if (!h->s) {
+ ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
bch_err(c, "failed to allocate new stripe");
goto err;
}
+
+ h->nr_created++;
}
- if (h->s->allocated)
+ struct ec_stripe_new *s = h->s;
+
+ if (s->allocated)
goto allocated;
- if (h->s->have_existing_stripe)
+ if (s->have_existing_stripe)
goto alloc_existing;
/* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h);
+ ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
if (!ret)
goto allocate_buf;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
@@ -2016,15 +2194,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
* existing stripe:
*/
while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h);
+ ret = __bch2_ec_stripe_head_reuse(trans, h, s);
if (!ret)
break;
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h);
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
if (ret)
goto err;
goto allocate_buf;
@@ -2042,19 +2220,19 @@ alloc_existing:
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
if (ret)
goto err;
allocate_buf:
- ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+ ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
if (ret)
goto err;
- h->s->allocated = true;
+ s->allocated = true;
allocated:
- BUG_ON(!h->s->idx);
- BUG_ON(!h->s->new_stripe.data[0]);
+ BUG_ON(!s->idx);
+ BUG_ON(!s->new_stripe.data[0]);
BUG_ON(trans->restarted);
return h;
err:
@@ -2062,6 +2240,73 @@ err:
return ERR_PTR(ret);
}
+/* device removal */
+
+static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_i_stripe *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ s64 sectors = 0;
+ for (unsigned i = 0; i < s->v.nr_blocks; i++)
+ sectors -= stripe_blockcount_get(&s->v, i);
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == k_a.k->p.inode)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ sectors = -sectors;
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_max_commit(trans, iter,
+ BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ bch2_invalidate_stripe_to_dev(trans, k);
+ })));
+}
+
+/* startup/shutdown */
+
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
@@ -2087,8 +2332,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
}
goto unlock;
found:
- h->s->err = -BCH_ERR_erofs_no_writes;
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
mutex_unlock(&h->lock);
}
@@ -2125,7 +2369,7 @@ int bch2_stripes_read(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
if (k.k->type != KEY_TYPE_stripe)
continue;
@@ -2133,17 +2377,9 @@ int bch2_stripes_read(struct bch_fs *c)
if (ret)
break;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
@@ -2159,7 +2395,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
size_t i;
mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->used, 50); i++) {
+ for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
@@ -2173,6 +2409,25 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->ec_stripes_heap_lock);
}
+static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct ec_stripe_new *s)
+{
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
+ s->idx, s->nr_data, s->nr_parity,
+ bitmap_weight(s->blocks_allocated, s->nr_data),
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i;
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
+ prt_printf(out, " %u", s->blocks[i]);
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
+ prt_newline(out);
+}
+
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct ec_stripe_head *h;
@@ -2180,28 +2435,21 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "target %u algo %u redundancy %u %s:\n",
- h->target, h->algo, h->redundancy,
- bch2_watermarks[h->watermark]);
+ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
+ h->disk_label, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark],
+ h->nr_created);
if (h->s)
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
- h->s->idx, h->s->nr_data, h->s->nr_parity,
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data));
+ bch2_new_stripe_to_text(out, c, h->s);
}
mutex_unlock(&c->ec_stripe_head_lock);
prt_printf(out, "in flight:\n");
mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
- s->idx, s->nr_data, s->nr_parity,
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
- }
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ bch2_new_stripe_to_text(out, c, s);
mutex_unlock(&c->ec_stripe_new_lock);
}
@@ -2212,11 +2460,9 @@ void bch2_fs_ec_exit(struct bch_fs *c)
while (1) {
mutex_lock(&c->ec_stripe_head_lock);
- h = list_first_entry_or_null(&c->ec_stripe_head_list,
- struct ec_stripe_head, list);
- if (h)
- list_del(&h->list);
+ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
mutex_unlock(&c->ec_stripe_head_lock);
+
if (!h)
break;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e805..583ca6a226da 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -6,17 +6,16 @@
#include "buckets_types.h"
#include "extents_types.h"
-enum bkey_invalid_flags;
-
-int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_invalid = bch2_stripe_invalid, \
+ .key_validate = bch2_stripe_validate, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_stripe, \
@@ -32,6 +31,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
@@ -95,7 +96,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
{
- return data_ptr->dev == stripe_ptr->dev &&
+ return (data_ptr->dev == stripe_ptr->dev ||
+ data_ptr->dev == BCH_SB_MEMBER_INVALID ||
+ stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
data_ptr->gen == stripe_ptr->gen &&
data_ptr->offset >= stripe_ptr->offset &&
data_ptr->offset < stripe_ptr->offset + sectors;
@@ -184,10 +187,15 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
- unsigned target;
+ unsigned disk_label;
unsigned algo;
unsigned redundancy;
enum bch_watermark watermark;
+ bool insufficient_devs;
+
+ unsigned long rw_devs_change_count;
+
+ u64 nr_created;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -200,7 +208,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
@@ -245,6 +253,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
index 44ce88ba08d7..b9770f24f213 100644
--- a/fs/bcachefs/ec_format.h
+++ b/fs/bcachefs/ec_format.h
@@ -11,7 +11,31 @@ struct bch_stripe {
__u8 csum_granularity_bits;
__u8 csum_type;
- __u8 pad;
+
+ /*
+ * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
+ *
+ * we can manage with this because this only needs to point to a
+ * disk label, not a target:
+ */
+ __u8 disk_label;
+
+ /*
+ * Variable length sections:
+ * - Pointers
+ * - Checksums
+ * 2D array of [stripe block/device][csum block], with checksum block
+ * size given by csum_granularity_bits
+ * - Block sector counts: per-block array of u16s
+ *
+ * XXX:
+ * Either checksums should have come last, or we should have included a
+ * checksum_size field (the size in bytes of the checksum itself, not
+ * the blocksize the checksum covers).
+ *
+ * Currently we aren't able to access the block sector counts if the
+ * checksum type is unknown.
+ */
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 976426da3a12..8d1e70e830ac 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -16,6 +16,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 blocks_nonempty;
+ u8 disk_label;
};
struct gc_stripe {
@@ -36,6 +37,6 @@ struct ec_stripe_heap_entry {
unsigned blocks_nonempty;
};
-typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index d260ff9bbfeb..43557bebd0f8 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "errcode.h"
+#include "trace.h"
#include <linux/errname.h>
@@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
return err == class;
}
-int __bch2_err_class(int err)
+int __bch2_err_class(int bch_err)
{
- err = -err;
- BUG_ON((unsigned) err >= BCH_ERR_MAX);
+ int std_err = -bch_err;
+ BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
- while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
- err = bch2_errcode_parents[err - BCH_ERR_START];
+ while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+ std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+ trace_error_downcast(bch_err, std_err, _RET_IP_);
- return -err;
+ return -std_err;
}
const char *bch2_blk_status_to_str(blk_status_t status)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 8c40c2067a04..4590cd0c7c90 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, mount_option) \
+ x(BCH_ERR_mount_option, option_name) \
+ x(BCH_ERR_mount_option, option_value) \
+ x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@@ -50,7 +54,8 @@
x(ENOMEM, ENOMEM_compression_bounce_read_init) \
x(ENOMEM, ENOMEM_compression_bounce_write_init) \
x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_decompression_workspace_init) \
+ x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
+ x(EIO, compression_workspace_not_initialized) \
x(ENOMEM, ENOMEM_bucket_gens) \
x(ENOMEM, ENOMEM_buckets_nouse) \
x(ENOMEM, ENOMEM_usage_init) \
@@ -78,6 +83,9 @@
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOMEM, ENOMEM_disk_accounting) \
+ x(ENOMEM, ENOMEM_stripe_head_alloc) \
+ x(ENOMEM, ENOMEM_journal_read_bucket) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -109,8 +117,15 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
- x(0, open_buckets_empty) \
- x(0, freelist_empty) \
+ x(ENOENT, ENOENT_inode_no_backpointer) \
+ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
+ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
+ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
+ x(EEXIST, EEXIST_str_hash_set) \
+ x(EEXIST, EEXIST_discard_in_flight_add) \
+ x(EEXIST, EEXIST_subvolume_create) \
+ x(ENOSPC, open_buckets_empty) \
+ x(ENOSPC, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
x(0, transaction_restart) \
x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@@ -136,6 +151,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(BCH_ERR_transaction_restart, transaction_restart_commit) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
@@ -152,15 +168,17 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(0, backpointer_to_overwritten_btree_node) \
- x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_delete_bkey) \
x(BCH_ERR_fsck, fsck_ignore) \
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(0, restart_recovery) \
+ x(EINVAL, restart_recovery) \
+ x(EINVAL, not_in_recovery) \
+ x(EINVAL, cannot_rewind_recovery) \
x(0, data_update_done) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
@@ -168,6 +186,7 @@
x(EINVAL, block_size_too_small) \
x(EINVAL, bucket_size_too_small) \
x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_size_too_big) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
@@ -176,6 +195,11 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
+ x(EINVAL, remove_with_metadata_missing_unimplemented)\
+ x(EINVAL, remove_would_lose_data) \
+ x(EINVAL, no_resize_with_buckets_nouse) \
+ x(EINVAL, inode_unpack_error) \
+ x(EINVAL, varint_decode_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -207,6 +231,7 @@
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
x(BCH_ERR_invalid_sb, invalid_sb_members) \
x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
@@ -223,9 +248,27 @@
x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, journal_shutdown) \
+ x(EIO, journal_flush_err) \
x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
x(EIO, sb_not_downgraded) \
- x(EIO, btree_write_all_failed) \
+ x(EIO, btree_node_write_all_failed) \
+ x(EIO, btree_node_read_error) \
+ x(EIO, btree_node_read_validate_error) \
+ x(EIO, btree_need_topology_repair) \
+ x(EIO, bucket_ref_update) \
+ x(EIO, trigger_pointer) \
+ x(EIO, trigger_stripe_pointer) \
+ x(EIO, metadata_bucket_inconsistency) \
+ x(EIO, mark_stripe) \
+ x(EIO, stripe_reconstruct) \
+ x(EIO, key_type_error) \
+ x(EIO, no_device_to_read_from) \
+ x(EIO, missing_indirect_extent) \
+ x(EIO, invalidate_stripe_to_dev) \
+ x(EIO, no_encryption_key) \
+ x(EIO, insufficient_journal_devices) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -238,7 +281,10 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem)
+ x(BCH_ERR_nopromote, nopromote_enomem) \
+ x(0, invalid_snapshot_node) \
+ x(0, option_needs_open_fs) \
+ x(0, remove_disk_accounting_entry)
enum bch_errcode {
BCH_ERR_START = 2048,
@@ -271,6 +317,7 @@ static inline long bch2_err_class(long err)
#define BLK_STS_REMOVED ((__force blk_status_t)128)
+#include <linux/blk_types.h>
const char *bch2_blk_status_to_str(blk_status_t);
#endif /* _BCACHFES_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d32c8bebe46c..038da6a61f6b 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,6 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
#include "error.h"
+#include "fs-common.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -13,9 +18,11 @@ bool bch2_inconsistent_error(struct bch_fs *c)
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
return false;
+ case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -25,11 +32,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
-void bch2_topology_error(struct bch_fs *c)
+int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
+ if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
bch2_inconsistent_error(c);
+ return -BCH_ERR_btree_need_topology_repair;
+ } else {
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+ -BCH_ERR_btree_node_read_validate_error;
+ }
}
void bch2_fatal_error(struct bch_fs *c)
@@ -89,7 +101,7 @@ static enum ask_yn parse_yn_response(char *buf)
}
#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
struct stdio_redirect *stdio = c->stdio;
@@ -99,25 +111,44 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
if (!stdio)
return YN_NO;
- char buf[100];
+ if (trans)
+ bch2_trans_unlock(trans);
+
+ unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
+ darray_char line = {};
int ret;
do {
+ unsigned long t;
bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+rewait:
+ t = unlock_long_at
+ ? max_t(long, unlock_long_at - jiffies, 0)
+ : MAX_SCHEDULE_TIMEOUT;
+
+ int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
+ if (r == -ETIME) {
+ bch2_trans_unlock_long(trans);
+ unlock_long_at = 0;
+ goto rewait;
+ }
- int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
- if (r < 0)
- return YN_NO;
- buf[r] = '\0';
- } while ((ret = parse_yn_response(buf)) < 0);
+ if (r < 0) {
+ ret = YN_NO;
+ break;
+ }
+
+ darray_last(line) = '\0';
+ } while ((ret = parse_yn_response(line.data)) < 0);
+ darray_exit(&line);
return ret;
}
#else
#include "tools-util.h"
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
char *buf = NULL;
size_t buflen = 0;
@@ -168,20 +199,93 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
return s;
}
-int bch2_fsck_err(struct bch_fs *c,
+/* s/fix?/fixing/ s/recreate?/recreating/ */
+static void prt_actioning(struct printbuf *out, const char *action)
+{
+ unsigned len = strlen(action);
+
+ BUG_ON(action[len - 1] != '?');
+ --len;
+
+ if (action[len - 1] == 'e')
+ --len;
+
+ prt_bytes(out, action, len);
+ prt_str(out, "ing");
+}
+
+static const u8 fsck_flags_extra[] = {
+#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
+ BCH_SB_ERRS()
+#undef x
+};
+
+static int do_fsck_ask_yn(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct printbuf *question,
+ const char *action)
+{
+ prt_str(question, ", ");
+ prt_str(question, action);
+
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", question->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, question->buf);
+
+ int ask = bch2_fsck_ask_yn(c, trans);
+
+ if (trans) {
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ret;
+ }
+
+ return ask;
+}
+
+int __bch2_fsck_err(struct bch_fs *c,
+ struct btree_trans *trans,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
const char *fmt, ...)
{
struct fsck_err_state *s = NULL;
va_list args;
- bool print = true, suppressing = false, inconsistent = false;
+ bool print = true, suppressing = false, inconsistent = false, exiting = false;
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
-
- if ((flags & FSCK_CAN_FIX) &&
- test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_fix;
+ const char *action_orig = "fix?", *action = action_orig;
+
+ might_sleep();
+
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
+ if (!c)
+ c = trans->c;
+
+ /*
+ * Ugly: if there's a transaction in the current task it has to be
+ * passed in to unlock if we prompt for user input.
+ *
+ * But, plumbing a transaction and transaction restarts into
+ * bkey_validate() is problematic.
+ *
+ * So:
+ * - make all bkey errors AUTOFIX, they're simple anyways (we just
+ * delete the key)
+ * - and we don't need to warn if we're not prompting
+ */
+ WARN_ON((flags & FSCK_CAN_FIX) &&
+ !(flags & FSCK_AUTOFIX) &&
+ !trans &&
+ bch2_current_has_btree_trans(c));
+
+ if (test_bit(err, c->sb.errors_silent))
+ return flags & FSCK_CAN_FIX
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
bch2_sb_error_count(c, err);
@@ -189,6 +293,19 @@ int bch2_fsck_err(struct bch_fs *c,
prt_vprintf(out, fmt, args);
va_end(args);
+ /* Custom fix/continue/recreate/etc.? */
+ if (out->buf[out->pos - 1] == '?') {
+ const char *p = strrchr(out->buf, ',');
+ if (p) {
+ out->pos = p - out->buf;
+ action = kstrdup(p + 2, GFP_KERNEL);
+ if (!action) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ }
+
mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
@@ -199,13 +316,15 @@ int bch2_fsck_err(struct bch_fs *c,
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
- mutex_unlock(&c->fsck_error_msgs_lock);
- printbuf_exit(&buf);
- return ret;
+ goto err_unlock;
}
kfree(s->last_msg);
s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+ if (!s->last_msg) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
if (c->opts.ratelimit_errors &&
!(flags & FSCK_NO_RATELIMIT) &&
@@ -224,14 +343,28 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if ((flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe)) {
+ prt_str(out, ", ");
+ if (flags & FSCK_CAN_FIX) {
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+
+ goto print;
+ } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
inconsistent = true;
ret = -BCH_ERR_fsck_errors_not_fixed;
} else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
prt_str(out, ", continuing");
@@ -246,36 +379,31 @@ int bch2_fsck_err(struct bch_fs *c,
: c->opts.fix_errors;
if (fix == FSCK_FIX_ask) {
- int ask;
-
- prt_str(out, ": fix?");
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", out->buf);
- else
- bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn(c);
+ ret = do_fsck_ask_yn(c, trans, out, action);
+ if (ret < 0)
+ goto err_unlock;
- if (ask >= YN_ALLNO && s)
- s->fix = ask == YN_ALLNO
+ if (ret >= YN_ALLNO && s)
+ s->fix = ret == YN_ALLNO
? FSCK_FIX_no
: FSCK_FIX_yes;
- ret = ask & 1
+ ret = ret & 1
? -BCH_ERR_fsck_fix
: -BCH_ERR_fsck_ignore;
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
- prt_str(out, ", not fixing");
+ prt_str(out, ", not ");
+ prt_actioning(out, action);
}
- } else if (flags & FSCK_NEED_FSCK) {
- prt_str(out, " (run fsck to correct)");
- } else {
+ } else if (!(flags & FSCK_CAN_IGNORE)) {
prt_str(out, " (repair unimplemented)");
}
@@ -284,6 +412,13 @@ int bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)) {
+ exiting = true;
+ print = true;
+ }
+print:
if (print) {
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s\n", out->buf);
@@ -291,9 +426,7 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_print_string_as_lines(KERN_ERR, out->buf);
}
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore))
+ if (exiting)
bch_err(c, "Unable to continue, halting");
else if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
@@ -301,20 +434,81 @@ int bch2_fsck_err(struct bch_fs *c,
if (s)
s->ret = ret;
- mutex_unlock(&c->fsck_error_msgs_lock);
+ if (inconsistent)
+ bch2_inconsistent_error(c);
+ /*
+ * We don't yet track whether the filesystem currently has errors, for
+ * log_fsck_err()s: that would require us to track for every error type
+ * which recovery pass corrects it, to get the fsck exit status correct:
+ */
+ if (flags & FSCK_CAN_FIX) {
+ if (ret == -BCH_ERR_fsck_fix) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
+ }
+ }
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
+err:
+ if (action != action_orig)
+ kfree(action);
printbuf_exit(&buf);
+ return ret;
+}
- if (inconsistent)
- bch2_inconsistent_error(c);
+static const char * const bch2_bkey_validate_contexts[] = {
+#define x(n) #n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ NULL
+};
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
+int __bch2_bkey_fsck_err(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ if (from.flags & BCH_VALIDATE_silent)
+ return -BCH_ERR_fsck_delete_bkey;
+
+ unsigned fsck_flags = 0;
+ if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
+ if (test_bit(err, c->sb.errors_silent))
+ return -BCH_ERR_fsck_delete_bkey;
+
+ fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
}
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ fsck_flags |= fsck_flags_extra[err];
+
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "invalid bkey in %s",
+ bch2_bkey_validate_contexts[from.from]);
+
+ if (from.from == BKEY_VALIDATE_journal)
+ prt_printf(&buf, " journal seq=%llu offset=%u",
+ from.journal_seq, from.journal_offset);
+
+ prt_str(&buf, " btree=");
+ bch2_btree_id_to_text(&buf, from.btree);
+ prt_printf(&buf, " level=%u: ", from.level);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\n ");
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ prt_str(&buf, ": delete?");
+
+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
@@ -335,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
+
+int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+{
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ /* XXX: we don't yet attempt to print paths when we don't know the subvol */
+ if (inum.subvol)
+ ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+ if (!inum.subvol || ret)
+ prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+
+ return trans_was_restarted(trans, restart_count);
+}
+
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ int ret = bch2_inum_err_msg_trans(trans, out, inum);
+ prt_printf(out, " offset %llu: ", offset);
+ return ret;
+}
+
+void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+{
+ bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+}
+
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fec17d1353d1..7acf2a27ca28 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -4,6 +4,7 @@
#include <linux/list.h>
#include <linux/printk.h>
+#include "bkey_types.h"
#include "sb-errors.h"
struct bch_dev;
@@ -30,7 +31,13 @@ struct work_struct;
bool bch2_inconsistent_error(struct bch_fs *);
-void bch2_topology_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
#define bch2_fs_inconsistent(c, ...) \
({ \
@@ -38,32 +45,11 @@ void bch2_topology_error(struct bch_fs *);
bch2_inconsistent_error(c); \
})
-#define bch2_fs_inconsistent_on(cond, c, ...) \
+#define bch2_fs_inconsistent_on(cond, ...) \
({ \
bool _ret = unlikely(!!(cond)); \
- \
if (_ret) \
- bch2_fs_inconsistent(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Later we might want to mark only the particular device inconsistent, not the
- * entire filesystem:
- */
-
-#define bch2_dev_inconsistent(ca, ...) \
-do { \
- bch_err(ca, __VA_ARGS__); \
- bch2_inconsistent_error((ca)->fs); \
-} while (0)
-
-#define bch2_dev_inconsistent_on(cond, ca, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_dev_inconsistent(ca, __VA_ARGS__); \
+ bch2_fs_inconsistent(__VA_ARGS__); \
_ret; \
})
@@ -102,27 +88,23 @@ struct fsck_err_state {
char *last_msg;
};
-enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NEED_FSCK = 1 << 2,
- FSCK_NO_RATELIMIT = 1 << 3,
-};
-
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-__printf(4, 5) __cold
-int bch2_fsck_err(struct bch_fs *,
+__printf(5, 6) __cold
+int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,
enum bch_sb_error_id,
const char *, ...);
+#define bch2_fsck_err(c, _flags, _err_type, ...) \
+ __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
+ type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
+ _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
+
void bch2_flush_fsck_errs(struct bch_fs *);
-#define __fsck_err(c, _flags, _err_type, ...) \
+#define fsck_err_wrap(_do) \
({ \
- int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
- __VA_ARGS__); \
- \
+ int _ret = _do; \
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
ret = _ret; \
@@ -132,18 +114,21 @@ void bch2_flush_fsck_errs(struct bch_fs *);
_ret == -BCH_ERR_fsck_fix; \
})
+#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
+
/* These macros return true if error should be fixed: */
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
- (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
-
-#define need_fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
-
-#define need_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+({ \
+ might_sleep(); \
+ \
+ if (type_is(c, struct bch_fs *)) \
+ WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
+ \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
+})
#define mustfix_fsck_err(c, _err_type, ...) \
__fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
@@ -157,24 +142,38 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-__printf(4, 0)
-static inline void bch2_bkey_fsck_err(struct bch_fs *c,
- struct printbuf *err_msg,
- enum bch_sb_error_id err_type,
- const char *fmt, ...)
-{
- va_list args;
+#define log_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+
+#define log_fsck_err_on(cond, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ if (_ret) \
+ log_fsck_err(__VA_ARGS__); \
+ _ret; \
+})
- va_start(args, fmt);
- prt_vprintf(err_msg, fmt, args);
- va_end(args);
-}
+enum bch_validate_flags;
+__printf(5, 6)
+int __bch2_bkey_fsck_err(struct bch_fs *,
+ struct bkey_s_c,
+ struct bkey_validate_context from,
+ enum bch_sb_error_id,
+ const char *, ...);
-#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+/*
+ * for now, bkey fsck errors are always handled by deleting the entire key -
+ * this will change at some point
+ */
+#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
do { \
- prt_printf(_err_msg, __VA_ARGS__); \
- bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
- ret = -BCH_ERR_invalid_bkey; \
+ int _ret = __bch2_bkey_fsck_err(c, k, from, \
+ BCH_FSCK_ERR_##_err_type, \
+ _err_msg, ##__VA_ARGS__); \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) \
+ ret = _ret; \
+ ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \
} while (0)
@@ -191,9 +190,9 @@ do { \
void bch2_fatal_error(struct bch_fs *);
-#define bch2_fs_fatal_error(c, ...) \
+#define bch2_fs_fatal_error(c, _msg, ...) \
do { \
- bch_err(c, __VA_ARGS__); \
+ bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
@@ -239,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
_ret; \
})
+int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
+int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
+
+void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
+void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
+
#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index b9033bb4f11c..6aac579a692a 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
break;
case KEY_TYPE_reflink_p: {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = le64_to_cpu(p.v->idx);
+ u64 idx = REFLINK_P_IDX(p.v);
unsigned sectors = bpos_min(*end, p.k->p).offset -
bkey_start_offset(p.k);
struct btree_iter iter;
@@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_SLOTS, r_k, ret2) {
+ BTREE_ITER_slots, r_k, ret2) {
if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
break;
@@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
bch2_trans_copy_iter(&copy, iter);
- for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113df9..2d8042f853dc 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -21,6 +21,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -37,8 +38,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
-static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
- unsigned dev)
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
+ unsigned dev)
{
struct bch_dev_io_failures *i;
@@ -52,7 +53,7 @@ static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
{
- struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
if (!f) {
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
@@ -71,6 +72,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
}
}
+static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
+}
+
/*
* returns true if p1 is better than p2:
*/
@@ -79,15 +86,20 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p2)
{
if (likely(!p1.idx && !p2.idx)) {
- struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
- struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+ u64 l1 = dev_latency(c, p1.ptr.dev);
+ u64 l2 = dev_latency(c, p2.ptr.dev);
- u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
- u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ l1 *= l1;
+ l2 *= l2;
/* Pick at random, biased in favor of the faster device: */
- return bch2_rand_range(l1 + l2) > l1;
+ return bch2_get_random_u64_below(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
@@ -109,47 +121,47 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
- struct bch_dev *ca;
int ret = 0;
if (k.k->type == KEY_TYPE_error)
- return -EIO;
+ return -BCH_ERR_key_type_error;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
- if (p.ptr.unwritten)
- return 0;
-
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ if (p.ptr.unwritten) {
+ ret = 0;
+ break;
+ }
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
- ret = -EIO;
+ ret = -BCH_ERR_no_device_to_read_from;
- if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+
+ if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+ f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
- if (!p.idx &&
- !bch2_dev_is_readable(ca))
+ if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
- if (bch2_force_reconstruct_read &&
- !p.idx && p.has_ec)
+ if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
- if (p.idx >= (unsigned) p.has_ec + 1)
+ if (p.idx > (unsigned) p.has_ec)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@@ -158,23 +170,23 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
*pick = p;
ret = 1;
}
+ rcu_read_unlock();
return ret;
}
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
- btree_ptr_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
+ c, btree_ptr_val_too_big,
"value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -185,18 +197,28 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
- btree_ptr_v2_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, btree_ptr_v2_min_key_bad,
+ "min_key > key");
+
+ if ((from.flags & BCH_VALIDATE_write) &&
+ c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
+ bkey_fsck_err_on(!bp.v->sectors_written,
+ c, btree_ptr_v2_written_0,
+ "sectors_written == 0");
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -242,7 +264,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
const union bch_extent_entry *en_r;
struct extent_ptr_decoded lp, rp;
bool use_right_ptr;
- struct bch_dev *ca;
en_l = l_ptrs.start;
en_r = r_ptrs.start;
@@ -273,8 +294,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp.ptr.dev);
- if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
+ bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
+ rcu_read_unlock();
+
+ if (!same_bucket)
return false;
if (lp.has_ec != rp.has_ec ||
@@ -379,15 +404,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
int ret = 0;
- bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
- reservation_key_nr_replicas_invalid,
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
+ c, reservation_key_nr_replicas_invalid,
"invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
return ret;
@@ -662,16 +686,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- return __extent_ptr_durability(ca, p);
+ return ca ? __extent_ptr_durability(ca, p) : 0;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
return 0;
return __extent_ptr_durability(ca, p);
@@ -684,8 +708,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -697,9 +723,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -760,14 +788,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
- union bch_extent_entry *ret = entry;
bool drop_crc = true;
+ if (k.k->type == KEY_TYPE_stripe) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
@@ -790,21 +821,28 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
break;
if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry)) {
- ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_is_stripe_ptr(entry))
extent_entry_drop(k, entry);
- }
}
-
- return ret;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
+ if (k.k->type != KEY_TYPE_stripe) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == ptr->dev && p.has_ec) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+ }
+
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
- union bch_extent_entry *ret =
- bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@@ -816,29 +854,20 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
!bch2_bkey_dirty_devs(k.s_c).nr) {
k.k->type = KEY_TYPE_error;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
k.k->type = KEY_TYPE_deleted;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
}
-
- return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr;
-
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
-
- if (ptr)
- bch2_bkey_drop_ptr_noerror(k, ptr);
+ bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
@@ -855,14 +884,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_dev *ca;
+ bool ret = false;
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return true;
+ !dev_ptr_stale_rcu(ca, ptr))) {
+ ret = true;
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -903,8 +939,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
if (p1.ptr.dev == p2.ptr.dev &&
p1.ptr.gen == p2.ptr.gen &&
+
+ /*
+ * This checks that the two pointers point
+ * to the same region on disk - adjusting
+ * for the difference in where the extents
+ * start, since one may have been trimmed:
+ */
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
+
+ /*
+ * This additionally checks that the
+ * extents overlap on disk, since the
+ * previous check may trigger spuriously
+ * when one extent is immediately partially
+ * overwritten with another extent (so that
+ * on disk they are adjacent) and
+ * compression is in use:
+ */
+ ((p1.ptr.offset >= p2.ptr.offset &&
+ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
+ (p2.ptr.offset >= p1.ptr.offset &&
+ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
return true;
return false;
@@ -931,31 +988,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
return NULL;
}
-void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bch_extent_ptr *ptr)
+{
+ if (!opts->promote_target ||
+ !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+ return false;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+
+ return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+}
+
+void bch2_extent_ptr_set_cached(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
- union bch_extent_entry *ec = NULL;
+ struct extent_ptr_decoded p;
- bkey_extent_entry_for_each(ptrs, entry) {
+ rcu_read_lock();
+ if (!want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ goto out;
+ }
+
+ /*
+ * Stripes can't contain cached data, for - reasons.
+ *
+ * Possibly something we can fix in the future?
+ */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (&entry->ptr == ptr) {
- ptr->cached = true;
- if (ec)
- extent_entry_drop(k, ec);
- return;
+ if (p.has_ec)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ else
+ ptr->cached = true;
+ goto out;
}
- if (extent_entry_is_stripe_ptr(entry))
- ec = entry;
- else if (extent_entry_is_ptr(entry))
- ec = NULL;
- }
-
BUG();
+out:
+ rcu_read_unlock();
}
/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
*
* Returns true if @k should be dropped entirely
*
@@ -964,15 +1044,143 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
*/
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
- struct bch_extent_ptr *ptr;
+ struct bch_dev *ca;
+ rcu_read_lock();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+ (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+ dev_ptr_stale_rcu(ca, ptr) > 0));
+ rcu_read_unlock();
return bkey_deleted(k.k);
}
+/*
+ * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
+ *
+ * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
+ * the promote target.
+ */
+bool bch2_extent_normalize_by_opts(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k)
+{
+ struct bkey_ptrs ptrs;
+ bool have_cached_ptr;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr(k, ptr);
+ goto restart_drop_ptrs;
+ }
+ have_cached_ptr = true;
+ }
+ rcu_read_unlock();
+
+ return bkey_deleted(k.k);
+}
+
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+ out->atomic++;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ca->mi.durability != 1)
+ prt_printf(out, " d=%u", ca->mi.durability);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ int stale = dev_ptr_stale_rcu(ca, ptr);
+ if (stale > 0)
+ prt_printf(out, " stale");
+ else if (stale)
+ prt_printf(out, " invalid");
+ }
+ rcu_read_unlock();
+ --out->atomic;
+}
+
+void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
+{
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
+ crc->compressed_size,
+ crc->uncompressed_size,
+ crc->offset, crc->nonce);
+ bch2_prt_csum_type(out, crc->csum_type);
+ prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
+ prt_str(out, " compress ");
+ bch2_prt_compression_type(out, crc->compression_type);
+}
+
+static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct bch_extent_rebalance *r)
+{
+ prt_str(out, "rebalance:");
+
+ prt_printf(out, " replicas=%u", r->data_replicas);
+ if (r->data_replicas_from_inode)
+ prt_str(out, " (inode)");
+
+ prt_str(out, " checksum=");
+ bch2_prt_csum_opt(out, r->data_checksum);
+ if (r->data_checksum_from_inode)
+ prt_str(out, " (inode)");
+
+ if (r->background_compression || r->background_compression_from_inode) {
+ prt_str(out, " background_compression=");
+ bch2_compression_opt_to_text(out, r->background_compression);
+
+ if (r->background_compression_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->background_target || r->background_target_from_inode) {
+ prt_str(out, " background_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->background_target);
+ else
+ prt_printf(out, "%u", r->background_target);
+
+ if (r->background_target_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->promote_target || r->promote_target_from_inode) {
+ prt_str(out, " promote_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->promote_target);
+ else
+ prt_printf(out, "%u", r->promote_target);
+
+ if (r->promote_target_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->erasure_code || r->erasure_code_from_inode) {
+ prt_printf(out, " ec=%u", r->erasure_code);
+ if (r->erasure_code_from_inode)
+ prt_str(out, " (inode)");
+ }
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -988,43 +1196,17 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
- prt_printf(out, " stale");
- }
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break;
- }
+
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
- crc.compressed_size,
- crc.uncompressed_size,
- crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type]);
- bch2_prt_compression_type(out, crc.compression_type);
+ bch2_extent_crc_unpacked_to_text(out, &crc);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1034,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
(u64) ec->idx, ec->block);
break;
}
- case BCH_EXTENT_ENTRY_rebalance: {
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- prt_str(out, "rebalance: target ");
- if (c)
- bch2_target_to_text(out, c, r->target);
- else
- prt_printf(out, "%u", r->target);
- prt_str(out, " compression ");
- bch2_compression_opt_to_text(out, r->compression);
+ case BCH_EXTENT_ENTRY_rebalance:
+ bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
- }
+
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1055,58 +1229,51 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-static int extent_ptr_invalid(struct bch_fs *c,
- struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata,
- struct printbuf *err)
+static int extent_ptr_validate(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- u64 bucket;
- u32 bucket_offset;
- struct bch_dev *ca;
int ret = 0;
- if (!bch2_dev_exists2(c, ptr->dev)) {
- /*
- * If we're in the write path this key might have already been
- * overwritten, and we could be seeing a device that doesn't
- * exist anymore due to racing with device removal:
- */
- if (flags & BKEY_INVALID_WRITE)
- return 0;
-
- bkey_fsck_err(c, err, ptr_to_invalid_device,
- "pointer to invalid device (%u)", ptr->dev);
- }
-
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
- ptr_to_duplicate_device,
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
+ c, ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
- bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
-
- bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
- ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
- bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
- ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
- ptr_spans_multiple_buckets,
+ /* bad pointers are repaired by check_fix_ptrs(): */
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca) {
+ rcu_read_unlock();
+ return 0;
+ }
+ u32 bucket_offset;
+ u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ unsigned first_bucket = ca->mi.first_bucket;
+ u64 nbuckets = ca->mi.nbuckets;
+ unsigned bucket_size = ca->mi.bucket_size;
+ rcu_read_unlock();
+
+ bkey_fsck_err_on(bucket >= nbuckets,
+ c, ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
+ bkey_fsck_err_on(bucket < first_bucket,
+ c, ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
+ c, ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, ca->mi.bucket_size);
+ bucket_offset, size_ondisk, bucket_size);
fsck_err:
return ret;
}
-int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1121,25 +1288,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
- extent_ptrs_invalid_entry,
- "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
+ c, extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry), c, err,
- btree_ptr_has_non_ptr,
+ !extent_entry_is_ptr(entry),
+ c, btree_ptr_has_non_ptr,
"has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
- size_ondisk, false, err);
+ ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
if (ret)
return ret;
- bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
- ptr_cached_and_erasure_coded,
+ bkey_fsck_err_on(entry->ptr.cached && have_ec,
+ c, ptr_cached_and_erasure_coded,
"cached, erasure coded ptr");
if (!entry->ptr.unwritten)
@@ -1156,44 +1322,54 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
- ptr_crc_uncompressed_size_too_small,
- "checksum offset + key size > uncompressed size");
- bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
- ptr_crc_csum_type_unknown,
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
+ c, ptr_crc_csum_type_unknown,
"invalid checksum type");
- bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
- ptr_crc_compression_type_unknown,
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
+ c, ptr_crc_compression_type_unknown,
"invalid compression type");
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
+ c, ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+ bkey_fsck_err_on(!crc_is_compressed(crc) &&
+ crc.compressed_size != crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_mismatch,
+ "not compressed but compressed != uncompressed size");
+
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce)
- bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ bkey_fsck_err(c, ptr_crc_nonce_mismatch,
"incorrect nonce");
}
- bkey_fsck_err_on(crc_since_last_ptr, c, err,
- ptr_crc_redundant,
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, ptr_crc_redundant,
"redundant crc entry");
crc_since_last_ptr = true;
- bkey_fsck_err_on(crc_is_encoded(crc) &&
- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
- ptr_crc_uncompressed_size_too_big,
- "too large encoded extent");
-
size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- bkey_fsck_err_on(have_ec, c, err,
- ptr_stripe_redundant,
+ bkey_fsck_err_on(have_ec,
+ c, ptr_stripe_redundant,
"redundant stripe entry");
have_ec = true;
break;
case BCH_EXTENT_ENTRY_rebalance: {
+ /*
+ * this shouldn't be a fsck error, for forward
+ * compatibility; the rebalance code should just refetch
+ * the compression opt if it's unknown
+ */
+#if 0
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
@@ -1202,28 +1378,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
opt.type, opt.level);
return -BCH_ERR_invalid_bkey;
}
+#endif
break;
}
}
}
- bkey_fsck_err_on(!nr_ptrs, c, err,
- extent_ptrs_no_ptrs,
+ bkey_fsck_err_on(!nr_ptrs,
+ c, extent_ptrs_no_ptrs,
"no ptrs");
- bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
- extent_ptrs_too_many_ptrs,
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
+ c, extent_ptrs_too_many_ptrs,
"too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
- bkey_fsck_err_on(have_written && have_unwritten, c, err,
- extent_ptrs_written_and_unwritten,
+ bkey_fsck_err_on(have_written && have_unwritten,
+ c, extent_ptrs_written_and_unwritten,
"extent with unwritten and written ptrs");
- bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
- extent_ptrs_unwritten,
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
+ c, extent_ptrs_unwritten,
"has unwritten ptrs");
- bkey_fsck_err_on(crc_since_last_ptr, c, err,
- extent_ptrs_redundant_crc,
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, extent_ptrs_redundant_crc,
"redundant crc entry");
- bkey_fsck_err_on(have_ec, c, err,
- extent_ptrs_redundant_stripe,
+ bkey_fsck_err_on(have_ec,
+ c, extent_ptrs_redundant_stripe,
"redundant stripe entry");
fsck_err:
return ret;
@@ -1243,7 +1420,7 @@ void bch2_ptr_swab(struct bkey_s k)
for (entry = ptrs.start;
entry < ptrs.end;
entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
+ switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
break;
case BCH_EXTENT_ENTRY_crc32:
@@ -1263,131 +1440,13 @@ void bch2_ptr_swab(struct bkey_s k)
break;
case BCH_EXTENT_ENTRY_rebalance:
break;
+ default:
+ /* Bad entry type: will be caught by validate() */
+ return;
}
}
}
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
- return &entry->rebalance;
-
- return NULL;
-}
-
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
- unsigned target, unsigned compression)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned rewrite_ptrs = 0;
-
- if (compression) {
- unsigned compression_type = bch2_compression_opt_to_type(compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned i = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- rewrite_ptrs = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-incompressible:
- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- unsigned i = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
- rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-
- return rewrite_ptrs;
-}
-
-bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
- /*
- * If it's an indirect extent, we don't delete the rebalance entry when
- * done so that we know what options were applied - check if it still
- * needs work done:
- */
- if (r &&
- k.k->type == KEY_TYPE_reflink_v &&
- !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
- r = NULL;
-
- return r != NULL;
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- struct bch_io_opts *opts)
-{
- struct bkey_s k = bkey_i_to_s(_k);
- struct bch_extent_rebalance *r;
- unsigned target = opts->background_target;
- unsigned compression = background_compression(*opts);
- bool needs_rebalance;
-
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
-
- /* get existing rebalance entry: */
- r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- if (r) {
- if (k.k->type == KEY_TYPE_reflink_v) {
- /*
- * indirect extents: existing options take precedence,
- * so that we don't move extents back and forth if
- * they're referenced by different inodes with different
- * options:
- */
- if (r->target)
- target = r->target;
- if (r->compression)
- compression = r->compression;
- }
-
- r->target = target;
- r->compression = compression;
- }
-
- needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
-
- if (needs_rebalance && !r) {
- union bch_extent_entry *new = bkey_val_end(k);
-
- new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
- new->rebalance.compression = compression;
- new->rebalance.target = target;
- new->rebalance.unused = 0;
- k.k->u64s += extent_entry_u64s(new);
- } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
- /*
- * For indirect extents, don't delete the rebalance entry when
- * we're finished so that we know we specifically moved it or
- * compressed it to its current location/compression type
- */
- extent_entry_drop(k, (union bch_extent_entry *) r);
- }
-
- return 0;
-}
-
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1447,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
case KEY_TYPE_reflink_p: {
struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
- le64_add_cpu(&p.v->idx, sub);
+ SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
break;
}
case KEY_TYPE_inline_data:
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6bf839d69e84..204d765dd74c 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -8,7 +8,6 @@
struct bch_fs;
struct btree_trans;
-enum bkey_invalid_flags;
/* extent entries: */
@@ -43,6 +42,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+#define extent_entry_next_safe(_entry, _end) \
+ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
+ ? extent_entry_next(_entry) \
+ : _end)
+
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@@ -103,17 +107,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}
static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
+ switch (__extent_entry_type(e)) {
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@@ -207,6 +211,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
}
+void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@@ -280,7 +286,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@@ -318,7 +324,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (extent_entry_type(_entry)) { \
+ switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@@ -344,13 +350,13 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
-#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
__bkey_extent_entry_for_each_from(_iter, _end, _iter) \
if (extent_entry_is_crc(_iter)) { \
@@ -365,7 +371,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
(_iter) = (_start); \
- bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ bkey_crc_next(_k, _end, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
#define bkey_for_each_crc(_k, _p, _crc, _iter) \
@@ -392,6 +398,8 @@ out: \
/* utility code common to all keys with pointers: */
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
+ unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
@@ -400,26 +408,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_invalid, \
+ .key_validate = bch2_btree_ptr_validate, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_extent, \
})
#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_v2_invalid, \
+ .key_validate = bch2_btree_ptr_v2_validate, \
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
@@ -432,7 +440,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_extent ((struct bkey_ops) { \
- .key_invalid = bch2_bkey_ptrs_invalid, \
+ .key_validate = bch2_bkey_ptrs_validate, \
.val_to_text = bch2_bkey_ptrs_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
@@ -442,13 +450,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
- .key_invalid = bch2_reservation_invalid, \
+ .key_validate = bch2_reservation_validate, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
.trigger = bch2_trigger_reservation, \
@@ -591,30 +599,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -626,9 +610,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
@@ -664,26 +645,38 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
- struct bch_extent_ptr *);
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+
+#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
do { \
- struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
- \
- _ptr = &_ptrs.start->ptr; \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
\
- while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
+ bkey_for_each_ptr(_ptrs, _ptr) \
if (_cond) { \
- _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
- _ptrs = bch2_bkey_ptrs(_k); \
- continue; \
+ bch2_bkey_drop_ptr_noerror(_k, _ptr); \
+ goto _again; \
} \
+} while (0)
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
\
- (_ptr)++; \
- } \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr(_k, _ptr); \
+ goto _again; \
+ } \
} while (0)
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
@@ -692,23 +685,29 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+ struct bkey_s, struct bch_extent_ptr *);
+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
-void bch2_ptr_swab(struct bkey_s);
-
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
- unsigned, unsigned);
-bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
+ struct bch_extent_ptr ptr2)
+{
+ return (ptr1.cached == ptr2.cached &&
+ ptr1.unwritten == ptr2.unwritten &&
+ ptr1.offset == ptr2.offset &&
+ ptr1.dev == ptr2.dev &&
+ ptr1.gen == ptr2.gen);
+}
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- struct bch_io_opts *);
+void bch2_ptr_swab(struct bkey_s);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
index 3bd2fdbb0817..c198dfc376d6 100644
--- a/fs/bcachefs/extents_format.h
+++ b/fs/bcachefs/extents_format.h
@@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr {
#endif
};
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
+/* bch_extent_rebalance: */
+#include "rebalance_format.h"
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 000000000000..2eaffe37b5e7
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, j, k;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = i; k = j * 2 + 1, k + 1 < n;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == n)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j != i;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = 0; k = j * 2 + 1, k + 1 < i;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == i)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap_func = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+
+#if 0
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/ktime.h>
+
+static u64 cmp_count;
+
+static int mycmp(const void *a, const void *b)
+{
+ u32 _a = *(u32 *)a;
+ u32 _b = *(u32 *)b;
+
+ cmp_count++;
+ if (_a < _b)
+ return -1;
+ else if (_a > _b)
+ return 1;
+ else
+ return 0;
+}
+
+static int test(void)
+{
+ size_t N, i;
+ ktime_t start, end;
+ s64 delta;
+ u32 *arr;
+
+ for (N = 10000; N <= 100000; N += 10000) {
+ arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
+ cmp_count = 0;
+
+ for (i = 0; i < N; i++)
+ arr[i] = get_random_u32();
+
+ start = ktime_get();
+ eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
+ end = ktime_get();
+
+ delta = ktime_us_delta(end, start);
+ printk(KERN_INFO "time: %lld\n", delta);
+ printk(KERN_INFO "comparisons: %lld\n", cmp_count);
+
+ u32 prev = 0;
+
+ eytzinger0_for_each(i, N) {
+ if (prev > arr[i])
+ goto err;
+ prev = arr[i];
+ }
+
+ kfree(arr);
+ }
+ return 0;
+
+err:
+ kfree(arr);
+ return -1;
+}
+#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf870..0541192d7bc0 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
+ *
+ * Two variants are provided, for one based indexing and zero based indexing.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@@ -38,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size);
+ return size ? rounddown_pow_of_two(size) : 0;
}
static inline unsigned eytzinger1_last(unsigned size)
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -91,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size + 1 - rounddown_pow_of_two(size)) << 1;
+ return size
+ ? (size + 1 - rounddown_pow_of_two(size)) << 1
+ : 0;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
@@ -101,7 +113,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -126,7 +138,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -164,7 +176,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -231,11 +243,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -244,21 +254,49 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ /*
+ * if eytitzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this work
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
+}
+
+static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ if (idx < nr && !cmp(base + idx * size, search))
+ return idx;
+
+ return eytzinger0_next(idx, nr);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
@@ -269,13 +307,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size, _size))) \
+ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 66b945be10c2..d8153fe27037 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
- (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+ (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 1c1ea0f0c692..2c3d46ac70c6 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
if (ret)
goto err;
@@ -68,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (!snapshot_src.inum) {
/* Inode wasn't specified, just snapshot: */
struct bch_subvolume s;
-
- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
- BTREE_ITER_CACHED, &s);
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
if (ret)
goto err;
@@ -78,7 +77,7 @@ int bch2_create_trans(struct btree_trans *trans,
}
ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -107,6 +106,7 @@ int bch2_create_trans(struct btree_trans *trans,
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ dir.subvol,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@@ -162,7 +162,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create|BTREE_ITER_with_updates);
if (ret)
goto err;
@@ -170,7 +170,11 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
- inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ if (S_ISDIR(mode) &&
+ !new_inode->bi_subvol)
+ new_inode->bi_depth = dir_u->bi_depth + 1;
+
+ inode_iter.flags &= ~BTREE_ITER_all_snapshots;
bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
ret = bch2_btree_iter_traverse(&inode_iter) ?:
@@ -197,16 +201,16 @@ int bch2_link_trans(struct btree_trans *trans,
if (dir.subvol != inum.subvol)
return -EXDEV;
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
- goto err;
+ return ret;
inode_u->bi_ctime = now;
ret = bch2_inode_nlink_inc(inode_u);
if (ret)
- return ret;
+ goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
@@ -222,7 +226,7 @@ int bch2_link_trans(struct btree_trans *trans,
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum.inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
goto err;
@@ -242,7 +246,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name,
- bool deleting_snapshot)
+ bool deleting_subvol)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@@ -254,34 +258,41 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_intent);
if (ret)
goto err;
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
- if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
- if (deleting_snapshot && !inode_u->bi_subvol) {
+ if (deleting_subvol && !inode_u->bi_subvol) {
ret = -BCH_ERR_ENOENT_not_subvol;
goto err;
}
- if (deleting_snapshot || inode_u->bi_subvol) {
+ if (inode_u->bi_subvol) {
+ /* Recursive subvolume destroy not allowed (yet?) */
+ ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol || inode_u->bi_subvol) {
ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
if (ret)
goto err;
@@ -314,7 +325,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
err:
@@ -349,6 +360,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
return ret;
}
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvol),
+ BTREE_ITER_cached, subvolume);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.fs_path_parent = cpu_to_le32(new_parent);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
int bch2_rename_trans(struct btree_trans *trans,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@@ -370,7 +397,7 @@ int bch2_rename_trans(struct btree_trans *trans,
int ret;
ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -379,7 +406,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (dst_dir.inum != src_dir.inum ||
dst_dir.subvol != src_dir.subvol) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -399,17 +426,47 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
if (dst_inum.inum) {
ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+ }
+
+ if (src_inode_u->bi_subvol &&
+ dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ dst_inode_u->bi_subvol &&
+ src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
if (ret)
goto err;
}
+ /* Can't move across subvolumes, unless it's a subvolume root: */
+ if (src_dir.subvol != dst_dir.subvol &&
+ (!src_inode_u->bi_subvol ||
+ (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (src_inode_u->bi_parent_subvol)
+ src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ dst_inode_u->bi_parent_subvol)
+ dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;
@@ -432,10 +489,10 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inum)) {
- ret = -ENOTEMPTY;
- goto err;
+ if (S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, dst_inum);
+ if (ret)
+ goto err;
}
}
@@ -457,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans,
dst_dir_u->bi_nlink++;
}
+ if (S_ISDIR(src_inode_u->bi_mode) &&
+ !src_inode_u->bi_subvol)
+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ S_ISDIR(dst_inode_u->bi_mode) &&
+ !dst_inode_u->bi_subvol)
+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
@@ -493,3 +559,94 @@ err:
bch2_trans_iter_exit(trans, &src_dir_iter);
return ret;
}
+
+static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+
+ unsigned can_print = min(n, printbuf_remaining(out));
+
+ b += n;
+
+ for (unsigned i = 0; i < can_print; i++)
+ out->buf[out->pos++] = *((char *) --b);
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str_reversed(struct printbuf *out, const char *s)
+{
+ prt_bytes_reversed(out, s, strlen(s));
+}
+
+static inline void reverse_bytes(void *b, size_t n)
+{
+ char *e = b + n, *s = b;
+
+ while (s < e) {
+ --e;
+ swap(*s, *e);
+ s++;
+ }
+}
+
+/* XXX: we don't yet attempt to print paths when we don't know the subvol */
+int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
+{
+ unsigned orig_pos = path->pos;
+ int ret = 0;
+
+ while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
+ inum.inum == BCACHEFS_ROOT_INO)) {
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
+ if (ret)
+ goto disconnected;
+
+ if (!inode.bi_dir && !inode.bi_dir_offset) {
+ ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+ goto disconnected;
+ }
+
+ inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
+ inum.inum = inode.bi_dir;
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto disconnected;
+
+ struct btree_iter d_iter;
+ struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
+ BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
+ 0, dirent);
+ ret = bkey_err(d.s_c);
+ if (ret)
+ goto disconnected;
+
+ struct qstr dirent_name = bch2_dirent_get_name(d);
+ prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
+
+ prt_char(path, '/');
+
+ bch2_trans_iter_exit(trans, &d_iter);
+ }
+
+ if (orig_pos == path->pos)
+ prt_char(path, '/');
+out:
+ ret = path->allocation_failure ? -ENOMEM : 0;
+ if (ret)
+ goto err;
+
+ reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
+ return 0;
+err:
+ return ret;
+disconnected:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ prt_str_reversed(path, "(disconnected)");
+ goto out;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index dde237859514..2b59210bb5e8 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_FS_COMMON_H
#define _BCACHEFS_FS_COMMON_H
+#include "dirent.h"
+
struct posix_acl;
#define BCH_CREATE_TMPFILE (1U << 0)
@@ -40,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *,
bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
+int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
+
#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 27710cdd5710..ab1d5db2fa56 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- if (!bio->bi_status) {
- folio_mark_uptodate(fi.folio);
- } else {
- folio_clear_uptodate(fi.folio);
- folio_set_error(fi.folio);
- }
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
bio_put(bio);
}
@@ -158,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
- u32 snapshot;
int ret = 0;
rbio->c = c;
@@ -166,29 +158,24 @@ static void bchfs_read(struct btree_trans *trans,
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
-retry:
bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
+ BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
- unsigned bytes, sectors, offset_into_extent;
+ unsigned bytes, sectors;
+ s64 offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -196,7 +183,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -207,17 +194,17 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
if (readpages_iter) {
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
- break;
+ goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@ -236,22 +223,20 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
-err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- iter.pos.inode,
- iter.pos.offset << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
@@ -264,9 +249,9 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts;
- struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
+ struct blk_plug plug;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
@@ -274,8 +259,19 @@ void bch2_readahead(struct readahead_control *ractl)
if (ret)
return;
+ /*
+ * Besides being a general performance optimization, plugging helps with
+ * avoiding btree transaction srcu warnings - submitting a bio can
+ * block, and we don't want todo that with the transaction locked.
+ *
+ * However, plugged bios are submitted when we schedule; we ideally
+ * would have our own scheduler hook to call unlock_long() before
+ * scheduling.
+ */
+ blk_start_plug(&plug);
bch2_pagecache_add_get(inode);
+ struct btree_trans *trans = bch2_trans_get(c);
while ((folio = readpage_iter_peek(&readpages_iter))) {
unsigned n = min_t(unsigned,
readpages_iter.folios.nr -
@@ -296,10 +292,10 @@ void bch2_readahead(struct readahead_control *ractl)
&readpages_iter);
bch2_trans_unlock(trans);
}
+ bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
-
- bch2_trans_put(trans);
+ blk_finish_plug(&plug);
darray_exit(&readpages_iter.folios);
}
@@ -314,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_read_bio *rbio;
struct bch_io_opts opts;
+ struct blk_plug plug;
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ BUG_ON(folio_test_uptodate(folio));
+ BUG_ON(folio_test_dirty(folio));
+
if (!bch2_folio_create(folio, GFP_KERNEL))
return -ENOMEM;
@@ -331,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+ blk_start_plug(&plug);
bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
+ blk_finish_plug(&plug);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -408,7 +410,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- folio_set_error(fi.folio);
mapping_set_error(fi.folio->mapping, -EIO);
s = __bch2_folio(fi.folio);
@@ -445,8 +446,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
*/
/*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
+ * The writeback flag is effectively our ref on the inode -
+ * fixup i_blocks before calling folio_end_writeback:
*/
bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
@@ -494,7 +495,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
+ op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;
@@ -542,7 +543,7 @@ do_io:
if (f_sectors > w->tmp_sectors) {
kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
w->tmp_sectors = f_sectors;
}
@@ -624,15 +625,6 @@ do_io:
BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
sectors << 9, offset << 9));
- /* Check for writing past i_size: */
- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags),
- "writing past i_size: %llu > %llu (unrounded %llu)\n",
- bio_end_sector(&w->io->op.wbio.bio) << 9,
- round_up(i_size, block_bytes(c)),
- i_size);
-
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
w->io->op.new_i_size = i_size;
@@ -667,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
int bch2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -686,9 +678,9 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_pagecache_add_get(inode);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
- mapping_gfp_mask(mapping));
- if (IS_ERR_OR_NULL(folio))
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
goto err_unlock;
offset = pos - folio_pos(folio);
@@ -736,12 +728,11 @@ out:
goto err;
}
- *pagep = &folio->page;
+ *foliop = folio;
return 0;
err:
folio_unlock(folio);
folio_put(folio);
- *pagep = NULL;
err_unlock:
bch2_pagecache_add_put(inode);
kfree(res);
@@ -751,12 +742,11 @@ err_unlock:
int bch2_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation *res = fsdata;
- struct folio *folio = page_folio(page);
unsigned offset = pos - folio_pos(folio);
lockdep_assert_held(&inode->v.i_rwsem);
@@ -827,9 +817,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_init(&fs);
ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
- mapping_gfp_mask(mapping),
- &fs);
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping), &fs);
if (ret)
goto out;
@@ -862,24 +851,32 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
f_pos = pos;
f_offset = pos - folio_pos(darray_first(fs));
darray_for_each(fs, fi) {
+ ssize_t f_reserved;
+
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
+
+ if (unlikely(f_reserved != f_len)) {
+ if (f_reserved < 0) {
+ if (f == darray_first(fs)) {
+ ret = f_reserved;
+ goto out;
+ }
+
+ folios_trunc(&fs, fi);
+ end = min(end, folio_end_pos(darray_last(fs)));
+ } else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ folios_trunc(&fs, fi + 1);
+ end = f_pos + f_reserved;
+ }
- /*
- * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
- * supposed to write as much as we have disk space for.
- *
- * On failure here we should still write out a partial page if
- * we aren't completely out of disk space - we don't do that
- * yet:
- */
- ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
- if (unlikely(ret)) {
- folios_trunc(&fs, fi);
- if (!fs.nr)
- goto out;
-
- end = min(end, folio_end_pos(darray_last(fs)));
break;
}
@@ -896,7 +893,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_for_each(fs, fi) {
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
if (!f_copied) {
folios_trunc(&fs, fi);
break;
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
index a6126ff790e6..3207ebbb4ab4 100644
--- a/fs/bcachefs/fs-io-buffered.h
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, struct page **, void **);
+int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+ unsigned len, struct folio **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page *, void *);
+ unsigned len, unsigned copied, struct folio *, void *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5ad..2089c36b5866 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct bch_io_opts opts;
struct dio_read *dio;
struct bio *bio;
+ struct blk_plug plug;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
size_t shorten;
@@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
*/
dio->should_dirty = iter_is_iovec(iter);
+ blk_start_plug(&plug);
+
goto start;
while (iter->count) {
bio = bio_alloc_bioset(NULL,
@@ -160,6 +163,8 @@ start:
bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
+ blk_finish_plug(&plug);
+
iter->count += shorten;
if (sync) {
@@ -179,7 +184,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct bch_inode_info *inode = file_bch_inode(file);
struct address_space *mapping = file->f_mapping;
size_t count = iov_iter_count(iter);
- ssize_t ret;
+ ssize_t ret = 0;
if (!count)
return 0; /* skip atime */
@@ -205,7 +210,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += ret;
} else {
bch2_pagecache_add_get(inode);
- ret = generic_file_read_iter(iocb, iter);
+ ret = filemap_read(iocb, iter, ret);
bch2_pagecache_add_put(inode);
}
out:
@@ -254,7 +259,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, err) {
+ BTREE_ITER_slots, k, err) {
if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
break;
@@ -369,6 +374,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
+ struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct bch_inode_info *inode = dio->inode;
bool sync = dio->sync;
@@ -387,6 +393,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -498,7 +506,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_subvol;
+ dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
@@ -536,7 +544,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
if (likely(!dio->iter.count) || dio->op.error)
break;
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
}
out:
return bch2_dio_write_done(dio);
@@ -590,22 +598,27 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
inode_lock(&inode->v);
ret = generic_write_checks(req, iter);
if (unlikely(ret <= 0))
- goto err;
+ goto err_put_write_ref;
ret = file_remove_privs(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
ret = file_update_time(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
+ goto err_put_write_ref;
+ }
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
@@ -618,7 +631,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
@@ -645,7 +658,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
ret = bch2_dio_write_loop(dio);
-err:
+out:
if (locked)
inode_unlock(&inode->v);
return ret;
@@ -653,7 +666,9 @@ err_put_bio:
bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
- goto err;
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
}
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index d359aa9b33b8..e072900e6a5b 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
break;
f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR_OR_NULL(f))
+ if (IS_ERR(f))
break;
BUG_ON(fs->nr && folio_pos(f) != pos);
@@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
struct folio **fs, unsigned nr_folios)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_folio *s;
u64 offset = folio_sector(fs[0]);
- unsigned folio_idx;
- u32 snapshot;
bool need_set = false;
- int ret;
- for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
if (!s)
return -ENOMEM;
@@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
if (!need_set)
return 0;
- folio_idx = 0;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
+ unsigned folio_idx = 0;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inum.inum, offset),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
- return ret;
+ if (folio_idx == nr_folios)
+ break;
+ 0;
+ })));
}
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
@@ -419,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c,
bch2_quota_reservation_put(c, inode, &res->quota);
}
-int bch2_folio_reservation_get(struct bch_fs *c,
+static int __bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
+ size_t offset, size_t len,
+ bool partial)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
+ struct disk_reservation disk_res = {};
+ size_t reserved = len;
int ret;
if (!s)
@@ -437,32 +420,70 @@ int bch2_folio_reservation_get(struct bch_fs *c,
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
- disk_sectors += sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
quota_sectors += s->s[i].state == SECTOR_unallocated;
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
+ partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
if (unlikely(ret))
return ret;
+
+ if (unlikely(disk_res.sectors != disk_sectors)) {
+ disk_sectors = quota_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ if (disk_sectors > disk_res.sectors) {
+ /*
+ * Make sure to get a reservation that's
+ * aligned to the filesystem blocksize:
+ */
+ unsigned reserved_offset = round_down(i << 9, block_bytes(c));
+ reserved = clamp(reserved_offset, offset, offset + len) - offset;
+
+ if (!reserved) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return -BCH_ERR_ENOSPC_disk_reservation;
+ }
+ break;
+ }
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+ }
}
if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors, true);
+ ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = {
- .sectors = disk_sectors
- };
-
- bch2_disk_reservation_put(c, &tmp);
- res->disk.sectors -= disk_sectors;
+ bch2_disk_reservation_put(c, &disk_res);
return ret;
}
}
- return 0;
+ res->disk.sectors += disk_res.sectors;
+ return partial ? reserved : 0;
+}
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
+}
+
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
}
static void bch2_clear_folio_bits(struct folio *folio)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 8cbaba6565b4..fad911cf5068 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
- unsigned nr_replicas:4;
-
+ u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- unsigned replicas_reserved:4;
-
- /* i_sectors: */
- enum bch_folio_sector_state state:8;
+ replicas_reserved:4;
+ u8 state;
};
struct bch_folio {
@@ -102,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
- return folio_has_private(folio)
- ? (struct bch_folio *) folio_get_private(folio)
- : NULL;
+ return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)
@@ -156,7 +151,12 @@ int bch2_folio_reservation_get(struct bch_fs *,
struct bch_inode_info *,
struct folio *,
struct bch2_folio_reservation *,
- unsigned, unsigned);
+ size_t, size_t);
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ size_t, size_t);
void bch2_set_folio_dirty(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a0c..717e7b94c66f 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* fsync: */
+static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
+ u64 *seq)
+{
+ struct printbuf buf = PRINTBUF;
+ struct bch_inode_unpacked u;
+ struct btree_iter iter;
+ int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
+ if (ret)
+ return ret;
+
+ u64 cur_seq = journal_cur_seq(&trans->c->journal);
+ *seq = min(cur_seq, u.bi_journal_seq);
+
+ if (fsck_err_on(u.bi_journal_seq > cur_seq,
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ cur_seq,
+ (bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = cur_seq;
+ ret = bch2_inode_write(trans, &iter, &u);
+ }
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
/*
* inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
* insert trigger: look up the btree inode instead
@@ -174,25 +202,28 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_info *inode)
{
- struct bch_inode_unpacked u;
- int ret;
-
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
- if (ret)
- return ret;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
- bch2_inode_flush_nocow_writes(c, inode);
+ u64 seq;
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
+ bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ return ret;
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
+ int ret, err;
+
+ trace_bch2_fsync(file, datasync);
ret = file_write_and_wait_range(file, start, end);
if (ret)
@@ -202,7 +233,15 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
ret = bch2_flush_inode(c, inode);
out:
- return bch2_err_class(ret);
+ ret = bch2_err_class(ret);
+ if (ret == -EROFS)
+ ret = -EIO;
+
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
+
+ return ret;
}
/* truncate: */
@@ -211,30 +250,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
- if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
- ret = 1;
- break;
- }
- start = iter.pos;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
+ subvol, 0, k, ({
+ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
+ })));
}
static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -257,7 +277,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* folio
*/
- ret = range_has_data(c, inode->ei_subvol,
+ ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
@@ -265,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
folio = __filemap_get_folio(mapping, index,
FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (IS_ERR_OR_NULL(folio)) {
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
}
@@ -446,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
ret = bch2_truncate_folio(inode, iattr->ia_size);
if (unlikely(ret < 0))
goto err;
+ ret = 0;
truncate_setsize(&inode->v, iattr->ia_size);
@@ -505,7 +526,7 @@ static int inode_update_times_fn(struct btree_trans *trans,
return 0;
}
-static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
u64 end = offset + len;
@@ -544,7 +565,7 @@ err:
return ret;
}
-static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
loff_t offset, loff_t len,
bool insert)
{
@@ -580,7 +601,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
return ret;
}
-static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
u64 start_sector, u64 end_sector)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -594,9 +615,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
- while (!ret && bkey_lt(iter.pos, end_pos)) {
+ while (!ret) {
s64 i_sectors_delta = 0;
struct quota_res quota_res = { 0 };
struct bkey_s_c k;
@@ -607,8 +628,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
+ if (bkey_ge(iter.pos, end_pos))
+ break;
+
ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_subvol, &snapshot);
+ inode->ei_inum.subvol, &snapshot);
if (ret)
goto bkey_err;
@@ -643,12 +667,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
- opts.data_replicas, true))
+ opts.data_replicas, true)) {
ret = drop_locks_do(trans,
(bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
opts.data_replicas, false), 0));
+ if (ret)
+ goto bkey_err;
+ }
bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
if (ret)
@@ -676,10 +703,13 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true))
- drop_locks_do(trans,
+ iter.pos.offset, true)) {
+ ret = drop_locks_do(trans,
bch2_mark_pagecache_reserved(inode, &hole_start,
iter.pos.offset, false));
+ if (ret)
+ goto bkey_err;
+ }
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -701,7 +731,7 @@ bkey_err:
return ret;
}
-static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -803,41 +833,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot;
u64 sectors = end - start;
- u64 pos = start;
- int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, pos, snapshot), 0);
-
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
- !(ret = bkey_err(k))) {
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
- bch2_btree_iter_advance(&iter);
- }
- pos = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+ inode->ei_inum.subvol, 0, k, ({
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+
+ 0;
+ })));
return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@@ -857,9 +869,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;
- if (remap_flags & REMAP_FILE_DEDUP)
- return -EOPNOTSUPP;
-
if ((pos_src & (block_bytes(c) - 1)) ||
(pos_dst & (block_bytes(c) - 1)))
return -EINVAL;
@@ -892,16 +901,24 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;
- file_update_time(file_dst);
+ if (!(remap_flags & REMAP_FILE_DEDUP))
+ file_update_time(file_dst);
bch2_mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
+ /*
+ * XXX: we'd like to be telling bch2_remap_range() if we have
+ * permission to write to the source file, and thus if io path option
+ * changes should be propagated through the copy, but we need mnt_idmap
+ * from the pathwalk, awkward
+ */
ret = bch2_remap_range(c,
inode_inum(dst), pos_dst >> 9,
inode_inum(src), pos_src >> 9,
aligned_len >> 9,
- pos_dst + len, &i_sectors_delta);
+ pos_dst + len, &i_sectors_delta,
+ false);
if (ret < 0)
goto err;
@@ -934,42 +951,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- POS(inode->v.i_ino, U64_MAX),
- 0, k, ret) {
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ 0;
+ })));
if (ret)
return ret;
@@ -987,50 +987,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ 0;
+ })));
if (ret)
return ret;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 3dc8630ff9fe..15725b4ce393 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
@@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
@@ -272,6 +272,69 @@ err1:
return ret;
}
+static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
+{
+ return put_user(inode->v.i_generation, arg);
+}
+
+static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
+{
+ int ret;
+ size_t len;
+ char label[BCH_SB_LABEL_SIZE];
+
+ BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
+
+ mutex_lock(&c->sb_lock);
+ memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+ mutex_unlock(&c->sb_lock);
+
+ len = strnlen(label, BCH_SB_LABEL_SIZE);
+ if (len == BCH_SB_LABEL_SIZE) {
+ bch_warn(c,
+ "label is too long, return the first %zu bytes",
+ --len);
+ }
+
+ ret = copy_to_user(user_label, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int bch2_ioc_setlabel(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ const char __user *user_label)
+{
+ int ret;
+ char label[BCH_SB_LABEL_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, user_label, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
+ bch_err(c,
+ "unable to set label with more than %d bytes",
+ BCH_SB_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->sb_lock);
+ strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
+ ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ mnt_drop_write_file(file);
+ return ret;
+}
+
static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
{
u32 flags;
@@ -308,8 +371,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
return ret;
}
-static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
{
struct inode *dir;
struct bch_inode_info *inode;
@@ -343,7 +406,7 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
sync_inodes_sb(c->vfs_sb);
up_read(&c->vfs_sb->s_umount);
}
-retry:
+
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
(const char __user *)(unsigned long)arg.src_ptr,
@@ -373,7 +436,7 @@ retry:
}
if (dst_dentry->d_inode) {
- error = -EEXIST;
+ error = -BCH_ERR_EEXIST_subvolume_create;
goto err3;
}
@@ -406,9 +469,12 @@ retry:
!arg.src_ptr)
snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+ down_write(&c->snapshot_create_lock);
inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
dst_dentry, arg.mode|S_IFDIR,
0, snapshot_src, create_flags);
+ up_write(&c->snapshot_create_lock);
+
error = PTR_ERR_OR_ZERO(inode);
if (error)
goto err3;
@@ -420,25 +486,10 @@ err3:
err2:
if (arg.src_ptr)
path_put(&src_path);
-
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
err1:
return error;
}
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- down_write(&c->snapshot_create_lock);
- long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
- up_write(&c->snapshot_create_lock);
-
- return ret;
-}
-
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
@@ -506,13 +557,21 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
case FS_IOC_GETVERSION:
- ret = -ENOTTY;
+ ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
break;
case FS_IOC_SETVERSION:
ret = -ENOTTY;
break;
+ case FS_IOC_GETFSLABEL:
+ ret = bch2_ioc_getlabel(c, (void __user *) arg);
+ break;
+
+ case FS_IOC_SETFSLABEL:
+ ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
+ break;
+
case FS_IOC_GOINGDOWN:
ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
break;
@@ -548,12 +607,18 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542db9..90ade8f648d9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -23,19 +23,23 @@
#include "journal.h"
#include "keylist.h"
#include "quota.h"
+#include "rebalance.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
+#include "trace.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
+#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
+#include <linux/siphash.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
@@ -56,15 +60,16 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
BUG_ON(bi->bi_inum != inode->v.i_ino);
- bch2_assert_pos_locked(trans, BTREE_ID_inodes,
- POS(0, bi->bi_inum),
- c->opts.inodes_use_key_cache);
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
i_gid_write(&inode->v, bi->bi_gid);
inode->v.i_mode = bi->bi_mode;
+ if (fields & ATTR_SIZE)
+ i_size_write(&inode->v, bi->bi_size);
+
if (fields & ATTR_ATIME)
inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
if (fields & ATTR_MTIME)
@@ -89,10 +94,25 @@ int __must_check bch2_write_inode(struct bch_fs *c,
retry:
bch2_trans_begin(trans);
- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT) ?:
- (set ? set(trans, inode, &inode_u, p) : 0) ?:
- bch2_inode_write(trans, &iter, &inode_u) ?:
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+
+ ret = (set ? set(trans, inode, &inode_u, p) : 0);
+ if (ret)
+ goto err;
+
+ struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+
+ if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
@@ -101,14 +121,15 @@ retry:
*/
if (!ret)
bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-
+err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "inode %u:%llu not found when updating",
+ "%s: inode %llu:%llu not found when updating",
+ bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -151,70 +172,342 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static int bch2_iget5_test(struct inode *vinode, void *p)
+static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ return a.subvol == b.subvol && a.inum == b.inum;
+}
+
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const subvol_inum *inum = data;
+ siphash_key_t k = { .key[0] = seed };
- return inode->ei_subvol == inum->subvol &&
- inode->ei_inode.bi_inum == inum->inum;
+ return siphash_2u64(inum->subvol, inum->inum, &k);
}
-static int bch2_iget5_set(struct inode *vinode, void *p)
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ const struct bch_inode_info *inode = data;
- inode->v.i_ino = inum->inum;
- inode->ei_subvol = inum->subvol;
- inode->ei_inode.bi_inum = inum->inum;
- return 0;
+ return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
}
-static unsigned bch2_inode_hash(subvol_inum inum)
+static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
{
- return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ const struct bch_inode_info *inode = obj;
+ const subvol_inum *v = arg->key;
+
+ return !subvol_inum_eq(inode->ei_inum, *v);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static const struct rhashtable_params bch2_vfs_inodes_params = {
+ .head_offset = offsetof(struct bch_inode_info, hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum),
+ .key_len = sizeof(subvol_inum),
+ .hashfn = bch2_vfs_inode_hash_fn,
+ .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
+ .obj_cmpfn = bch2_vfs_inode_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
+ .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
+ .key_len = sizeof(u64),
+ .automatic_shrinking = true,
+};
+
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
+{
+ struct bch_fs *c = trans->c;
+ struct rhltable *ht = &c->vfs_inodes_by_inum_table;
+ u64 inum = p.offset;
+ DARRAY(u32) subvols;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return false;
+
+ darray_init(&subvols);
+restart_from_top:
+
+ /*
+ * Tweaked version of __rhashtable_lookup(); we need to get a list of
+ * subvolumes in which the given inode number is open.
+ *
+ * For this to work, we don't include the subvolume ID in the key that
+ * we hash - all inodes with the same inode number regardless of
+ * subvolume will hash to the same slot.
+ *
+ * This will be less than ideal if the same file is ever open
+ * simultaneously in many different snapshots:
+ */
+ rcu_read_lock();
+ struct rhash_lock_head __rcu *const *bkt;
+ struct rhash_head *he;
+ unsigned int hash;
+ struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
+restart:
+ hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
+ bkt = rht_bucket(tbl, hash);
+ do {
+ struct bch_inode_info *inode;
+
+ rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+ if (inode->ei_inum.inum == inum) {
+ ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+ GFP_NOWAIT|__GFP_NOWARN);
+ if (ret) {
+ rcu_read_unlock();
+ ret = darray_make_room(&subvols, 1);
+ if (ret)
+ goto err;
+ subvols.nr = 0;
+ goto restart_from_top;
+ }
+ }
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
+ if (unlikely(tbl))
+ goto restart;
+ rcu_read_unlock();
+
+ darray_for_each(subvols, i) {
+ u32 snap;
+ ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+ if (ret)
+ break;
+ }
+err:
+ darray_exit(&subvols);
+ return ret;
+}
+
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+ return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
+}
+
+static void __wait_on_freeing_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ subvol_inum inum)
+{
+ wait_queue_head_t *wq;
+ struct wait_bit_queue_entry wait;
+
+ wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->v.i_lock);
+
+ if (__bch2_inode_hash_find(c, inum) == inode)
+ schedule_timeout(HZ * 10);
+ finish_wait(wq, &wait.wq_entry);
+}
+
+static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
+ subvol_inum inum)
{
- struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
- struct btree_trans *trans;
- struct bch_subvolume subvol;
- int ret;
+repeat:
+ inode = __bch2_inode_hash_find(c, inum);
+ if (inode) {
+ spin_lock(&inode->v.i_lock);
+ if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_unlock(&inode->v.i_lock);
+ return NULL;
+ }
+ if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!trans) {
+ __wait_on_freeing_inode(c, inode, inum);
+ } else {
+ bch2_trans_unlock(trans);
+ __wait_on_freeing_inode(c, inode, inum);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ goto repeat;
+ }
+ __iget(&inode->v);
+ spin_unlock(&inode->v.i_lock);
+ }
- inode = to_bch_ei(iget5_locked(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->v.i_state & I_NEW))
- return &inode->v;
+ return inode;
+}
- trans = bch2_trans_get(c);
- ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ spin_lock(&inode->v.i_lock);
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ spin_unlock(&inode->v.i_lock);
+
+ if (remove) {
+ int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
+ ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ &inode->hash, bch2_vfs_inodes_params);
+ BUG_ON(ret);
+ inode->v.i_hash.pprev = NULL;
+ /*
+ * This pairs with the bch2_inode_hash_find() ->
+ * __wait_on_freeing_inode() path
+ */
+ inode_wake_up_bit(&inode->v, __I_NEW);
+ }
+}
- if (!ret)
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- bch2_trans_put(trans);
+static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct bch_inode_info *inode)
+{
+ struct bch_inode_info *old = inode;
+
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
+retry:
+ if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+ &inode->ei_inum,
+ &inode->hash,
+ bch2_vfs_inodes_params))) {
+ old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+ if (!old)
+ goto retry;
+
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+
+ /*
+ * bcachefs doesn't use I_NEW; we have no use for it since we
+ * only insert fully created inodes in the inode hash table. But
+ * discard_new_inode() expects it to be set...
+ */
+ inode->v.i_state |= I_NEW;
+ /*
+ * We don't want bch2_evict_inode() to delete the inode on disk,
+ * we just raced and had another inode in cache. Normally new
+ * inodes don't have nlink == 0 - except tmpfiles do...
+ */
+ set_nlink(&inode->v, 1);
+ discard_new_inode(&inode->v);
+ return old;
+ } else {
+ int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash,
+ bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
+ inode_fake_hash(&inode->v);
+
+ inode_sb_list_add(&inode->v);
- if (ret) {
- iget_failed(&inode->v);
- return ERR_PTR(bch2_err_class(ret));
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
}
+}
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
+})
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+ BUG();
+}
+
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
+{
+ struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
+ bch2_inode_cache, gfp);
+ if (!inode)
+ return NULL;
+
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ inode->ei_flags = 0;
+ mutex_init(&inode->ei_quota_lock);
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
+ kmem_cache_free(bch2_inode_cache, inode);
+ return NULL;
+ }
+
+ return inode;
+}
+
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
+
+ if (unlikely(!inode)) {
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ return inode;
+}
+
+static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ struct bch_inode_info *inode = bch2_new_inode(trans);
+ if (IS_ERR(inode))
+ return inode;
+
+ bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
+
+ return bch2_inode_hash_insert(trans->c, trans, inode);
+
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
+ if (inode)
+ return &inode->v;
+
+ struct btree_trans *trans = bch2_trans_get(c);
- unlock_new_inode(&inode->v);
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+ int ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+ bch2_trans_put(trans);
- return &inode->v;
+ return ret ? ERR_PTR(ret) : &inode->v;
}
struct bch_inode_info *
@@ -226,12 +519,14 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode, *old;
+ struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
/*
@@ -243,7 +538,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = to_bch_ei(new_inode(c->vfs_sb));
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
@@ -258,13 +553,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+ kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), current_fsuid()),
- from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ from_kuid(i_user_ns(&dir->v), kuid),
+ from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -272,11 +569,10 @@ retry:
if (unlikely(ret))
goto err_before_quota;
- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
- ret = bch2_subvolume_get(trans, inum.subvol, true,
- BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_trans_commit(trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
@@ -289,11 +585,10 @@ err_before_quota:
if (!(flags & BCH_CREATE_TMPFILE)) {
bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME);
+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
mutex_unlock(&dir->ei_update_lock);
}
- bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -303,37 +598,16 @@ err_before_quota:
* we must insert the new inode into the inode cache before calling
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
+ *
+ * also, calling bch2_inode_hash_insert() without passing in the
+ * transaction object is sketchy - if we could ever end up in
+ * __wait_on_freeing_inode(), we'd risk deadlock.
+ *
+ * But that shouldn't be possible, since we still have the inode locked
+ * that we just created, and we _really_ can't take a transaction
+ * restart here.
*/
-
- inode->v.i_state |= I_CREATING;
-
- old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
-
- if (unlikely(old != inode)) {
- /*
- * We raced, another process pulled the new inode into cache
- * before us:
- */
- make_bad_inode(&inode->v);
- iput(&inode->v);
-
- inode = old;
- } else {
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- /*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
- */
- unlock_new_inode(&inode->v);
- }
-
+ inode = bch2_inode_hash_insert(c, NULL, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -352,23 +626,79 @@ err_trans:
/* methods */
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_hash_info *dir_hash_info,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dirent_iter = {};
+ subvol_inum inum = {};
+ struct printbuf buf = PRINTBUF;
+
+ struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
+ if (inode)
+ goto out;
+
+ struct bch_subvolume subvol;
+ struct bch_inode_unpacked inode_u;
+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
+ bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ c, "dirent to missing inode:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ /* regular files may have hardlinks: */
+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
+ !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
+ c,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, &inode_u),
+ buf.buf))) {
+ ret = -ENOENT;
+ goto err;
+ }
+out:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ return inode;
+err:
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
- struct inode *vinode = NULL;
- subvol_inum inum = { .subvol = 1 };
- int ret;
- ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
- &dentry->d_name, &inum);
-
- if (!ret)
- vinode = bch2_vfs_inode_get(c, inum);
+ struct bch_inode_info *inode;
+ bch2_trans_do(c,
+ PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+ &hash, &dentry->d_name)));
+ if (IS_ERR(inode))
+ inode = NULL;
- return d_splice_alias(vinode, dentry);
+ return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@@ -398,11 +728,11 @@ static int __bch2_link(struct bch_fs *c,
struct bch_inode_info *dir,
struct dentry *dentry)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct bch_inode_unpacked dir_u, inode_u;
int ret;
mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
ret = commit_do(trans, NULL, NULL, 0,
bch2_link_trans(trans,
@@ -412,7 +742,7 @@ static int __bch2_link(struct bch_fs *c,
if (likely(!ret)) {
bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME);
+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
}
@@ -431,8 +761,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
return bch2_err_class(ret);
@@ -449,11 +779,12 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_inode_unpacked dir_u, inode_u;
- struct btree_trans *trans = bch2_trans_get(c);
int ret;
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+ struct btree_trans *trans = bch2_trans_get(c);
+
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
@@ -464,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
goto err;
bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME);
+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_MTIME);
@@ -476,8 +807,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
set_nlink(&inode->v, 0);
}
err:
- bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_put(trans);
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
return ret;
}
@@ -487,7 +818,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret);
}
@@ -544,15 +875,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
if (mode == BCH_RENAME_OVERWRITE) {
@@ -562,18 +894,18 @@ static int bch2_rename2(struct mnt_idmap *idmap,
return ret;
}
- trans = bch2_trans_get(c);
-
bch2_lock_inodes(INODE_UPDATE_LOCK,
src_dir,
dst_dir,
src_inode,
dst_inode);
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ trans = bch2_trans_get(c);
+
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
- goto err;
+ goto err_tx_restart;
if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
ret = bch2_fs_quota_transfer(c, src_inode,
@@ -593,29 +925,59 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
+retry:
+ bch2_trans_begin(trans);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode));
+ ret = bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode);
if (unlikely(ret))
+ goto err_tx_restart;
+
+ if (whiteout) {
+ whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+ ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ bch2_inode_init_early(c, whiteout_inode_u);
+
+ ret = bch2_create_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ whiteout_inode_u,
+ &src_dentry->d_name,
+ from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+ S_IFCHR|WHITEOUT_MODE, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+ bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (unlikely(ret)) {
+err_tx_restart:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err;
+ }
BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
dst_inode->v.i_ino != dst_inode_u.bi_inum);
bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
- ATTR_MTIME|ATTR_CTIME);
+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
if (src_dir != dst_dir)
bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
- ATTR_MTIME|ATTR_CTIME);
+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
ATTR_CTIME);
@@ -652,11 +1014,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
+ kuid_t kuid;
+ kgid_t kgid;
- if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
- if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+ }
+ if (ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+ }
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@@ -671,11 +1039,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
- ? attr->ia_gid
+ ? kgid
: inode->v.i_gid;
- if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ if (!in_group_or_capable(idmap, &inode->v,
+ make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@@ -691,17 +1059,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
- if (attr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (attr->ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
+ }
- if (attr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (attr->ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
+ }
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -715,7 +1089,7 @@ retry:
acl = NULL;
ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -757,13 +1131,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
- stat->uid = inode->v.i_uid;
- stat->gid = inode->v.i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@@ -772,6 +1148,19 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
+ stat->subvol = inode->ei_inum.subvol;
+ stat->result_mask |= STATX_SUBVOL;
+
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ /*
+ * this is incorrect; we should be tracking this in superblock,
+ * and checking the alignment of open devices
+ */
+ stat->dio_mem_align = SECTOR_SIZE;
+ stat->dio_offset_align = block_bytes(c);
+ }
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
@@ -801,7 +1190,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
@@ -892,16 +1281,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
- unsigned offset_into_extent, sectors;
bool have_extent = false;
- u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
if (ret)
return ret;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
@@ -910,42 +1297,50 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
- if (ret)
- goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(ei->v.i_ino, start, snapshot), 0);
+ POS(ei->v.i_ino, start), 0);
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
- !(ret = bkey_err(k))) {
+ while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_max(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ continue;
+
+ if (!k.k)
+ break;
+
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(&iter);
continue;
}
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&cur, c, k);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
- break;
+ continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
bch2_cut_front(POS(k.k->p.inode,
bkey_start_offset(k.k) +
@@ -969,11 +1364,7 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
}
- start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
if (!ret && have_extent) {
bch2_trans_unlock(trans);
@@ -1029,11 +1420,13 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret)
return ret;
}
+ file->f_mode |= FMODE_CAN_ODIRECT;
+
return generic_file_open(vinode, file);
}
@@ -1043,6 +1436,7 @@ static const struct file_operations bch_file_operations = {
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -1060,7 +1454,7 @@ static const struct inode_operations bch_file_inode_operations = {
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1080,7 +1474,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1102,7 +1496,7 @@ static const struct inode_operations bch_symlink_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1112,7 +1506,7 @@ static const struct inode_operations bch_special_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1126,7 +1520,6 @@ static const struct address_space_operations bch_address_space_operations = {
.write_end = bch2_write_end,
.invalidate_folio = bch2_invalidate_folio,
.release_folio = bch2_release_folio,
- .direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
.migrate_folio = filemap_migrate_folio,
#endif
@@ -1159,8 +1552,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
return (struct bcachefs_fid) {
- .inum = inode->ei_inode.bi_inum,
- .subvol = inode->ei_subvol,
+ .inum = inode->ei_inum.inum,
+ .subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation,
};
}
@@ -1245,7 +1638,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_subvol,
+ inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir,
};
@@ -1281,7 +1674,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret)
goto err;
@@ -1312,8 +1705,7 @@ retry:
if (ret)
goto err;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
} else {
/*
@@ -1334,8 +1726,7 @@ retry:
if (ret)
continue;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
}
}
@@ -1367,20 +1758,18 @@ static const struct export_operations bch_export_ops = {
.get_name = bch2_get_name,
};
-static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+static void bch2_vfs_inode_init(struct btree_trans *trans,
+ subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
+ inode->v.i_ino = inum.inum;
+ inode->ei_inum = inum;
+ inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0);
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
- else
- clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
@@ -1388,7 +1777,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
- inode->ei_subvol = inum.subvol;
+
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_mapping->a_ops = &bch_address_space_operations;
@@ -1414,34 +1805,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
mapping_set_large_folios(inode->v.i_mapping);
}
-static struct inode *bch2_alloc_inode(struct super_block *sb)
+static void bch2_free_inode(struct inode *vinode)
{
- struct bch_inode_info *inode;
-
- inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- mutex_init(&inode->ei_quota_lock);
-
- return &inode->v;
-}
-
-static void bch2_i_callback(struct rcu_head *head)
-{
- struct inode *vinode = container_of(head, struct inode, i_rcu);
- struct bch_inode_info *inode = to_bch_ei(vinode);
-
- kmem_cache_free(bch2_inode_cache, inode);
-}
-
-static void bch2_destroy_inode(struct inode *vinode)
-{
- call_rcu(&vinode->i_rcu, bch2_i_callback);
+ kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}
static int inode_update_times_fn(struct btree_trans *trans,
@@ -1477,6 +1843,17 @@ static void bch2_evict_inode(struct inode *vinode)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
+
+ /*
+ * evict() has waited for outstanding writeback, we'll do no more IO
+ * through this inode: it's safe to remove from VFS inode hashtable here
+ *
+ * Do that now so that other threads aren't blocked from pulling it back
+ * in, there's no reason for them to be:
+ */
+ if (!delete)
+ bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
@@ -1484,12 +1861,18 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
- if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ if (delete) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode_inum(inode));
+
+ /*
+ * If we are deleting, we need it present in the vfs hash table
+ * so that fsck can check if unlinked inodes are still open:
+ */
+ bch2_inode_hash_remove(c, inode);
}
mutex_lock(&c->vfs_inodes_lock);
@@ -1519,7 +1902,7 @@ again:
mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_subvol))
+ if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue;
if (!(inode->v.i_state & I_DONTCACHE) &&
@@ -1532,14 +1915,16 @@ again:
break;
}
} else if (clean_pass && this_pass_clean) {
- wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
- DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+ prepare_to_wait_event(wq_head, &wqe.wq_entry,
+ TASK_UNINTERRUPTIBLE);
mutex_unlock(&c->vfs_inodes_lock);
schedule();
- finish_wait(wq, &wait.wq_entry);
+ finish_wait(wq_head, &wqe.wq_entry);
goto again;
}
}
@@ -1572,7 +1957,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
- u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -1583,10 +1967,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
@@ -1597,6 +1978,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
struct bch_fs *c = sb->s_fs_info;
int ret;
+ trace_bch2_sync_fs(sb, wait);
+
if (c->opts.journal_flush_disabled)
return 0;
@@ -1625,15 +2008,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags, char *data)
+static int bch2_remount(struct super_block *sb, int *flags,
+ struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
- struct bch_opts opts = bch2_opts_empty();
- int ret;
-
- ret = bch2_parse_mount_opts(c, &opts, data);
- if (ret)
- goto err;
+ int ret = 0;
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
@@ -1685,29 +2064,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
- if (!(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
+ bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+ OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+ printbuf_nul_terminate(&buf);
+ seq_printf(seq, ",%s", buf.buf);
- printbuf_reset(&buf);
- bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
- OPT_SHOW_MOUNT_STYLE);
- seq_putc(seq, ',');
- seq_puts(seq, buf.buf);
- }
-
- if (buf.allocation_failure)
- ret = -ENOMEM;
+ int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@@ -1753,14 +2117,13 @@ static int bch2_unfreeze(struct super_block *sb)
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
- .destroy_inode = bch2_destroy_inode,
+ .free_inode = bch2_free_inode,
.write_inode = bch2_vfs_write_inode,
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
.statfs = bch2_statfs,
.show_devname = bch2_show_devname,
.show_options = bch2_show_options,
- .remount_fs = bch2_remount,
.put_super = bch2_put_super,
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
@@ -1793,75 +2156,63 @@ static int bch2_test_super(struct super_block *s, void *data)
return true;
}
-static struct dentry *bch2_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bch2_fs_get_tree(struct fs_context *fc)
{
struct bch_fs *c;
struct super_block *sb;
struct inode *vinode;
- struct bch_opts opts = bch2_opts_empty();
+ struct bch2_opts_parse *opts_parse = fc->fs_private;
+ struct bch_opts opts = opts_parse->opts;
+ darray_str devs;
+ darray_fs devs_to_fs = {};
int ret;
- opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
-
- ret = bch2_parse_mount_opts(NULL, &opts, data);
- if (ret)
- return ERR_PTR(ret);
+ opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+ opt_set(opts, nostart, true);
- if (!dev_name || strlen(dev_name) == 0)
- return ERR_PTR(-EINVAL);
+ if (!fc->source || strlen(fc->source) == 0)
+ return -EINVAL;
- darray_str devs;
- ret = bch2_split_devs(dev_name, &devs);
+ ret = bch2_split_devs(fc->source, &devs);
if (ret)
- return ERR_PTR(ret);
+ return ret;
- darray_fs devs_to_fs = {};
darray_for_each(devs, i) {
ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
- if (ret) {
- sb = ERR_PTR(ret);
- goto got_sb;
- }
+ if (ret)
+ goto err;
}
- sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
+ sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;
c = bch2_fs_open(devs.data, devs.nr, opts);
- if (IS_ERR(c)) {
- sb = ERR_CAST(c);
- goto got_sb;
- }
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ goto err;
/* Some options can't be parsed until after the fs is started: */
- ret = bch2_parse_mount_opts(c, &opts, data);
- if (ret) {
- bch2_fs_stop(c);
- sb = ERR_PTR(ret);
- goto got_sb;
- }
+ opts = bch2_opts_empty();
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
+ if (ret)
+ goto err_stop_fs;
bch2_opts_apply(&c->opts, opts);
- sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
- if (IS_ERR(sb))
- bch2_fs_stop(c);
-got_sb:
- darray_exit(&devs_to_fs);
- bch2_darray_str_exit(&devs);
-
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- ret = bch2_err_class(ret);
- return ERR_PTR(ret);
- }
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_stop_fs;
+ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
+ ret = PTR_ERR_OR_ZERO(sb);
+ if (ret)
+ goto err_stop_fs;
+got_sb:
c = sb->s_fs_info;
if (sb->s_root) {
- if ((flags ^ sb->s_flags) & SB_RDONLY) {
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
}
@@ -1882,6 +2233,9 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
+ super_set_sysfs_name_uuid(sb);
+ sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -1925,11 +2279,30 @@ got_sb:
sb->s_flags |= SB_ACTIVE;
out:
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+err:
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
+ if (ret)
+ pr_err("error: %s", bch2_err_str(ret));
+ /*
+ * On an inconsistency error in recovery we might see an -EROFS derived
+ * errorcode (from the journal), but we don't want to return that to
+ * userspace as that causes util-linux to retry the mount RO - which is
+ * confusing:
+ */
+ if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
+ ret = -EIO;
+ return bch2_err_class(ret);
+
+err_stop_fs:
+ bch2_fs_stop(c);
+ goto err;
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
- return ERR_PTR(bch2_err_class(ret));
+ goto err;
}
static void bch2_kill_sb(struct super_block *sb)
@@ -1940,12 +2313,90 @@ static void bch2_kill_sb(struct super_block *sb)
bch2_fs_free(c);
}
+static void bch2_fs_context_free(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ if (opts) {
+ printbuf_exit(&opts->parse_later);
+ kfree(opts);
+ }
+}
+
+static int bch2_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ /*
+ * the "source" param, i.e., the name of the device(s) to mount,
+ * is handled by the VFS layer.
+ */
+ if (!strcmp(param->key, "source"))
+ return -ENOPARAM;
+
+ struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = NULL;
+
+ /* for reconfigure, we already have a struct bch_fs */
+ if (fc->root)
+ c = fc->root->d_sb->s_fs_info;
+
+ int ret = bch2_parse_one_mount_opt(c, &opts->opts,
+ &opts->parse_later, param->key,
+ param->string);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ return bch2_remount(sb, &fc->sb_flags, opts->opts);
+}
+
+static const struct fs_context_operations bch2_context_ops = {
+ .free = bch2_fs_context_free,
+ .parse_param = bch2_fs_parse_param,
+ .get_tree = bch2_fs_get_tree,
+ .reconfigure = bch2_fs_reconfigure,
+};
+
+static int bch2_init_fs_context(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+
+ if (!opts)
+ return -ENOMEM;
+
+ opts->parse_later = PRINTBUF;
+
+ fc->ops = &bch2_context_ops;
+ fc->fs_private = opts;
+
+ return 0;
+}
+
+void bch2_fs_vfs_exit(struct bch_fs *c)
+{
+ if (c->vfs_inodes_by_inum_table.ht.tbl)
+ rhltable_destroy(&c->vfs_inodes_by_inum_table);
+ if (c->vfs_inodes_table.tbl)
+ rhashtable_destroy(&c->vfs_inodes_table);
+}
+
+int bch2_fs_vfs_init(struct bch_fs *c)
+{
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
+ rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
+}
+
static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .mount = bch2_mount,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bcachefs",
+ .init_fs_context = bch2_init_fs_context,
+ .kill_sb = bch2_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
@@ -1960,7 +2411,8 @@ int __init bch2_vfs_init(void)
{
int ret = -ENOMEM;
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT);
if (!bch2_inode_cache)
goto err;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index c3af7225ff69..dd2198541455 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -13,6 +13,10 @@
struct bch_inode_info {
struct inode v;
+ struct rhash_head hash;
+ struct rhlist_head by_inum_hash;
+ subvol_inum ei_inum;
+
struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
@@ -24,8 +28,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- u32 ei_subvol;
-
/*
* When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices
@@ -50,10 +52,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
- return (subvol_inum) {
- .subvol = inode->ei_subvol,
- .inum = inode->ei_inode.bi_inum,
- };
+ return inode->ei_inum;
}
/*
@@ -67,6 +66,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
* those:
*/
#define EI_INODE_SNAPSHOT 1
+#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@@ -147,6 +147,8 @@ struct bch_inode_info *
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -187,6 +189,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+void bch2_fs_vfs_exit(struct bch_fs *);
+int bch2_fs_vfs_init(struct bch_fs *);
+
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -194,8 +199,14 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
+
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
+
+static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
+static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
+
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6a760777bafb..0e85131d0af8 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
@@ -8,18 +9,63 @@
#include "darray.h"
#include "dirent.h"
#include "error.h"
+#include "fs.h"
#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
+#include "thread_with_file.h"
#include "xattr.h"
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+ return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+}
+
+static void dirent_inode_mismatch_msg(struct printbuf *out,
+ struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ prt_str(out, "inode points to dirent that does not point back:");
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, dirent.s_c);
+ prt_newline(out);
+ bch2_inode_unpacked_to_text(out, inode);
+}
+
+static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -29,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
{
u64 sectors = 0;
- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
0, k, ({
@@ -46,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
{
u64 subdirs = 0;
- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
0, k, ({
@@ -63,9 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -79,36 +123,31 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
- POS(0, inode_nr),
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
}
-
- ret = bch2_inode_unpack(k, inode);
-err:
+ ret = -BCH_ERR_ENOENT_inode;
+found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, *snapshot), 0);
+ SPOS(0, inode_nr, snapshot), 0);
ret = bkey_err(k);
if (ret)
goto err;
@@ -116,8 +155,6 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
ret = bkey_is_inode(k.k)
? bch2_inode_unpack(k, inode)
: -BCH_ERR_ENOENT_inode;
- if (!ret)
- *snapshot = iter.pos.snapshot;
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -129,47 +166,19 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
+ struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
+ int ret = bkey_err(k);
if (ret)
return ret;
- d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
*target = le64_to_cpu(d.v->d_inum);
*type = d.v->d_type;
bch2_trans_iter_exit(trans, &iter);
return 0;
}
-static int __write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __write_inode(trans, inode, snapshot));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@@ -184,23 +193,56 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
return ret;
}
+/*
+ * Find any subvolume associated with a tree of snapshots
+ * We can't rely on master_subvol - it might have been deleted.
+ */
+static int find_snapshot_tree_subvol(struct btree_trans *trans,
+ u32 tree_id, u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+ if (le32_to_cpu(s.v->tree) != tree_id)
+ continue;
+
+ if (s.v->subvol) {
+ *subvol = le32_to_cpu(s.v->subvol);
+ goto found;
+ }
+ }
+ ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
+ struct btree_iter lostfound_iter = { NULL };
u64 inum = 0;
unsigned d_type = 0;
int ret;
@@ -211,20 +253,43 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (ret)
return ret;
- subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
+ u32 subvolid;
+ ret = find_snapshot_tree_subvol(trans,
+ bch2_snapshot_tree(c, snapshot), &subvolid);
+ bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
+ bch2_snapshot_tree(c, snapshot));
+ if (ret)
+ return ret;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
+ bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ subvol_inum root_inum = {
+ .subvol = subvolid,
+ .inum = le64_to_cpu(subvol.inode)
+ };
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
- u32 root_inode_snapshot = snapshot;
- ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, subvolid);
if (ret)
return ret;
@@ -248,25 +313,37 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, snapshot, lostfound);
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
return ret;
create_lostfound:
/*
+ * we always create lost+found in the root snapshot; we don't want
+ * different branches of the snapshot tree to have different lost+found
+ */
+ snapshot = le32_to_cpu(st.root_snapshot);
+ /*
* XXX: we could have a nicer log message here if we had a nice way to
* walk backpointers to print a path
*/
- bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+ struct printbuf path = PRINTBUF;
+ ret = bch2_inum_to_path(trans, root_inum, &path);
+ if (ret)
+ goto err;
+
+ bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
+ path.buf, root_inum.subvol, snapshot);
+ printbuf_exit(&path);
u64 now = bch2_current_time(c);
- struct btree_iter lostfound_iter = { NULL };
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, lostfound);
bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
lostfound->bi_dir = root_inode.bi_inum;
+ lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
root_inode.bi_nlink++;
@@ -280,87 +357,345 @@ create_lostfound:
goto err;
ret = bch2_dirent_create_snapshot(trans,
- root_inode.bi_inum, snapshot, &root_hash_info,
+ 0, root_inode.bi_inum, snapshot, &root_hash_info,
mode_to_type(lostfound->bi_mode),
&lostfound_str,
lostfound->bi_inum,
&lostfound->bi_dir_offset,
- BCH_HASH_SET_MUST_CREATE) ?:
+ STR_HASH_must_create) ?:
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch_err_msg(c, ret, "creating lost+found");
bch2_trans_iter_exit(trans, &lostfound_iter);
return ret;
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 inode_snapshot)
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+ if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+ return false;
+
+ return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+ SPOS(d_pos.inode, d_pos.offset, snapshot),
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(k.k->p, d_pos)) {
+ /*
+ * delet_at() doesn't work because the update path doesn't
+ * internally use BTREE_ITER_with_updates yet
+ */
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&k->k);
+ k->k.type = KEY_TYPE_whiteout;
+ k->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
- struct bch_hash_info dir_hash;
+ struct bch_fs *c = trans->c;
struct bch_inode_unpacked lostfound;
char name_buf[20];
- struct qstr name;
- u64 dir_offset = 0;
int ret;
- ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
+ u32 dirent_snapshot = inode->bi_snapshot;
+ if (inode->bi_subvol) {
+ inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ u64 root_inum;
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &dirent_snapshot, &root_inum);
+ if (ret)
+ return ret;
+
+ snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ }
+
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
- if (S_ISDIR(inode->bi_mode)) {
- lostfound.bi_nlink++;
+ lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
- ret = __write_inode(trans, &lostfound, U32_MAX);
- if (ret)
- return ret;
+ /* ensure lost+found inode is also present in inode snapshot */
+ if (!inode->bi_subvol) {
+ BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
+ lostfound.bi_snapshot = inode->bi_snapshot;
}
- dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+ ret = __bch2_fsck_write_inode(trans, &lostfound);
+ if (ret)
+ return ret;
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
- name = (struct qstr) QSTR(name_buf);
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+ struct qstr name = QSTR(name_buf);
+
+ inode->bi_dir = lostfound.bi_inum;
ret = bch2_dirent_create_snapshot(trans,
- lostfound.bi_inum, inode_snapshot,
+ inode->bi_parent_subvol, lostfound.bi_inum,
+ dirent_snapshot,
&dir_hash,
inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ &name,
+ inode->bi_subvol ?: inode->bi_inum,
+ &inode->bi_dir_offset,
+ STR_HASH_must_create);
+ if (ret) {
+ bch_err_msg(c, ret, "error creating dirent");
+ return ret;
+ }
+
+ ret = __bch2_fsck_write_inode(trans, inode);
if (ret)
return ret;
- inode->bi_dir = lostfound.bi_inum;
- inode->bi_dir_offset = dir_offset;
+ /*
+ * Fix up inodes in child snapshots: if they should also be reattached
+ * update the backpointer field, if they should not be we need to emit
+ * whiteouts for the dirent we just created.
+ */
+ if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+ snapshot_id_list whiteouts_done;
+ struct btree_iter iter;
+ struct bkey_s_c k;
- return __write_inode(trans, inode, inode_snapshot);
+ darray_init(&whiteouts_done);
+
+ for_each_btree_key_reverse_norestart(trans, iter,
+ BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bkey_is_inode(k.k) ||
+ !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+ snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+ continue;
+
+ struct bch_inode_unpacked child_inode;
+ ret = bch2_inode_unpack(k, &child_inode);
+ if (ret)
+ break;
+
+ if (!inode_should_reattach(&child_inode)) {
+ ret = maybe_delete_dirent(trans,
+ SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+ dirent_snapshot),
+ k.k->p.snapshot);
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+ if (ret)
+ break;
+ } else {
+ iter.snapshot = k.k->p.snapshot;
+ child_inode.bi_dir = inode->bi_dir;
+ child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+ ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ break;
+ }
+ }
+ darray_exit(&whiteouts_done);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
+}
+
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
+ if (!inode->bi_dir)
+ return 0;
- d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0,
- dirent);
- ret = bkey_err(d) ?:
- __remove_dirent(trans, d.k->p);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
+ __remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-struct snapshots_seen_entry {
- u32 id;
- u32 equiv;
-};
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ ret = remove_backpointer(trans, &inode);
+ if (!bch2_err_matches(ret, ENOENT))
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ return ret;
+
+ ret = reattach_inode(trans, &inode);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ return ret;
+}
+
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i_mode = S_IFREG;
+ u64 i_size = 0;
+
+ switch (btree) {
+ case BTREE_ID_extents: {
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ i_size = k.k->p.offset << 9;
+ break;
+ }
+ case BTREE_ID_dirents:
+ i_mode = S_IFDIR;
+ break;
+ case BTREE_ID_xattrs:
+ break;
+ default:
+ BUG();
+ }
+
+ struct bch_inode_unpacked new_inode;
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
+ new_inode.bi_size = i_size;
+ new_inode.bi_inum = inum;
+ new_inode.bi_snapshot = snapshot;
+
+ return __bch2_fsck_write_inode(trans, &new_inode);
+}
struct snapshots_seen {
struct bpos pos;
- DARRAY(struct snapshots_seen_entry) ids;
+ snapshot_id_list ids;
};
static inline void snapshots_seen_exit(struct snapshots_seen *s)
@@ -375,20 +710,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
- struct snapshots_seen_entry *i, n = {
- .id = id,
- .equiv = bch2_snapshot_equiv(c, id),
- };
- int ret = 0;
-
+ u32 *i;
__darray_for_each(s->ids, i) {
- if (i->id == id)
+ if (*i == id)
return 0;
- if (i->id > id)
+ if (*i > id)
break;
}
- ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
if (ret)
bch_err(c, "error reallocating snapshots_seen table (size %zu)",
s->ids.size);
@@ -398,42 +728,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry n = {
- .id = pos.snapshot,
- .equiv = bch2_snapshot_equiv(c, pos.snapshot),
- };
- int ret = 0;
-
if (!bkey_eq(s->pos, pos))
s->ids.nr = 0;
-
s->pos = pos;
- s->pos.snapshot = n.equiv;
-
- darray_for_each(s->ids, i) {
- if (i->id == n.id)
- return 0;
- /*
- * We currently don't rigorously track for snapshot cleanup
- * needing to be run, so it shouldn't be a fsck error yet:
- */
- if (i->equiv == n.equiv) {
- bch_err(c, "snapshot deletion did not finish:\n"
- " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
- bch2_btree_id_str(btree_id),
- pos.inode, pos.offset,
- i->id, n.id, n.equiv);
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
- }
- }
-
- ret = darray_push(&s->ids, n);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
+ return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
/**
@@ -453,12 +752,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
ssize_t i;
EBUG_ON(id > ancestor);
- EBUG_ON(!bch2_snapshot_is_equiv(c, id));
- EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
/* @ancestor should be the snapshot most recently added to @seen */
EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+ EBUG_ON(ancestor != darray_last(seen->ids));
if (id == ancestor)
return true;
@@ -477,9 +774,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
*/
for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i].equiv >= id;
+ i >= 0 && seen->ids.data[i] >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
return false;
return true;
@@ -510,9 +807,6 @@ static int ref_visible2(struct bch_fs *c,
u32 src, struct snapshots_seen *src_seen,
u32 dst, struct snapshots_seen *dst_seen)
{
- src = bch2_snapshot_equiv(c, src);
- dst = bch2_snapshot_equiv(c, dst);
-
if (dst > src) {
swap(dst, src);
swap(dst_seen, src_seen);
@@ -528,21 +822,24 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
- bool seen_this_pos;
u64 count;
+ u64 i_size;
};
struct inode_walker {
bool first_this_inode;
+ bool have_inodes;
bool recalculate_sums;
struct bpos last_pos;
DARRAY(struct inode_walker_entry) inodes;
+ snapshot_id_list deletes;
};
static void inode_walker_exit(struct inode_walker *w)
{
darray_exit(&w->inodes);
+ darray_exit(&w->deletes);
}
static struct inode_walker inode_walker_init(void)
@@ -555,11 +852,10 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
{
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(inode, &u));
-
- return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ return bch2_inode_unpack(inode, &u) ?:
+ darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
- .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ .snapshot = inode.k->p.snapshot,
}));
}
@@ -571,11 +867,17 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ /*
+ * We no longer have inodes for w->last_pos; clear this to avoid
+ * screwing up check_i_sectors/check_subdir_count if we take a
+ * transaction restart here:
+ */
+ w->have_inodes = false;
w->recalculate_sums = false;
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -588,41 +890,45 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
+ w->have_inodes = true;
return 0;
}
static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
- u32 snapshot, bool is_whiteout)
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
- struct inode_walker_entry *i;
-
- snapshot = bch2_snapshot_equiv(c, snapshot);
+ bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
+ struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
goto found;
return NULL;
found:
- BUG_ON(snapshot > i->snapshot);
+ BUG_ON(k.k->p.snapshot > i->snapshot);
- if (snapshot != i->snapshot && !is_whiteout) {
+ if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- size_t pos;
- int ret;
- new.snapshot = snapshot;
- new.count = 0;
+ new.snapshot = k.k->p.snapshot;
+ new.count = 0;
+ new.i_size = 0;
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
- w->last_pos.inode, snapshot, i->snapshot);
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
+ printbuf_exit(&buf);
- while (i > w->inodes.data && i[-1].snapshot > snapshot)
+ while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
--i;
- pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new);
+ size_t pos = i - w->inodes.data;
+ int ret = darray_insert_item(&w->inodes, pos, new);
if (ret)
return ERR_PTR(ret);
@@ -633,27 +939,24 @@ found:
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w, struct bpos pos,
- bool is_whiteout)
+ struct inode_walker *w,
+ struct bkey_s_c k)
{
- if (w->last_pos.inode != pos.inode) {
- int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, pos)) {
- darray_for_each(w->inodes, i)
- i->seen_this_pos = false;
}
- w->last_pos = pos;
+ w->last_pos = k.k->p;
- return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+ return lookup_inode_for_snapshot(trans->c, w, k);
}
-static int __get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
+static int get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -661,21 +964,23 @@ static int __get_visible_inodes(struct btree_trans *trans,
int ret;
w->inodes.nr = 0;
+ w->deletes.nr = 0;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
- if (!ref_visible(c, s, s->pos.snapshot, equiv))
+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
continue;
- if (bkey_is_inode(k.k))
- add_inode(c, w, k);
+ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
+ continue;
- if (equiv >= s->pos.snapshot)
+ ret = bkey_is_inode(k.k)
+ ? add_inode(c, w, k)
+ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
+ if (ret)
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -683,143 +988,149 @@ static int __get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+/*
+ * Prefer to delete the first one, since that will be the one at the wrong
+ * offset:
+ * return value: 0 -> delete k1, 1 -> delete k2
+ */
+int bch2_fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
+ if (new->k.type != KEY_TYPE_dirent)
+ return 0;
+
+ struct bkey_i_dirent *d = bkey_i_to_dirent(new);
+ struct inode_walker target = inode_walker_init();
int ret = 0;
- if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- bkey_in_missing_snapshot,
- "key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
-fsck_err:
- printbuf_exit(&buf);
+ if (d->v.d_type == DT_SUBVOL) {
+ BUG();
+ } else {
+ ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
+ if (ret)
+ goto err;
+
+ darray_for_each(target.inodes, i) {
+ i->inode.bi_dir_offset = d->k.p.offset;
+ ret = __bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+ inode_walker_exit(&target);
return ret;
}
-static int hash_redo_key(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k)
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
{
- struct bkey_i *delete;
- struct bkey_i *tmp;
-
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- if (IS_ERR(delete))
- return PTR_ERR(delete);
+ if (inode->bi_subvol) {
+ u64 inum;
+ int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+ if (ret)
+ return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+ }
- tmp = bch2_bkey_make_mut_noupdate(trans, k);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
+ return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
- bkey_init(&delete->k);
- delete->k.p = k_iter->pos;
- return bch2_btree_iter_traverse(k_iter) ?:
- bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_snapshot(trans, desc, hash_info,
- (subvol_inum) { 0, k.k->p.inode },
- k.k->p.snapshot, tmp,
- BCH_HASH_SET_MUST_CREATE,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+ int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-static int hash_check_key(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+static int check_inode_dirent_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ bool *write_inode)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
struct printbuf buf = PRINTBUF;
- struct bkey_s_c k;
- u64 hash;
- int ret = 0;
-
- if (hash_k.k->type != desc.key_type)
- return 0;
-
- hash = desc.hash_bkey(hash_info, hash_k);
-
- if (likely(hash == hash_k.k->p.offset))
- return 0;
-
- if (hash_k.k->p.offset < hash)
- goto bad_hash;
-
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_SLOTS, k, ret) {
- if (bkey_eq(k.k->p, hash_k.k->p))
- break;
- if (fsck_err_on(k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k), c,
- hash_table_key_duplicate,
- "duplicate hash table keys:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- buf.buf))) {
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
- break;
- }
+ u32 inode_snapshot = inode->bi_snapshot;
+ struct btree_iter dirent_iter = {};
+ struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+ int ret = bkey_err(d);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- goto bad_hash;
- }
+ if (fsck_err_on(ret,
+ trans, inode_points_to_missing_dirent,
+ "inode points to missing dirent\n%s",
+ (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+ trans, inode_points_to_wrong_dirent,
+ "%s",
+ (printbuf_reset(&buf),
+ dirent_inode_mismatch_msg(&buf, c, d, inode),
+ buf.buf))) {
+ /*
+ * We just clear the backpointer fields for now. If we find a
+ * dirent that points to this inode in check_dirents(), we'll
+ * update it then; then when we get to check_path() if the
+ * backpointer is still 0 we'll reattach it.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ *write_inode = true;
}
-out:
- bch2_trans_iter_exit(trans, &iter);
+
+ ret = 0;
+fsck_err:
+ bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
-bad_hash:
- if (fsck_err(c, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
- bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- bch_err_fn(c, ret);
- if (ret)
- return ret;
- ret = -BCH_ERR_transaction_restart_nested;
- }
-fsck_err:
- goto out;
}
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+static int get_snapshot_root_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *root,
+ u64 inum)
{
struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
+ struct bkey_s_c k;
+ int ret = 0;
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found_root;
+ }
+ if (ret)
+ goto err;
+ BUG();
+found_root:
+ ret = bch2_inode_unpack(k, root);
+err:
bch2_trans_iter_exit(trans, &iter);
- return k.k->type == KEY_TYPE_set;
+ return ret;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_inode_unpacked *prev,
- struct snapshots_seen *s,
- bool full)
+ struct bch_inode_unpacked *snapshot_root,
+ struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
struct bch_inode_unpacked u;
bool do_update = false;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
goto err;
if (ret)
@@ -832,140 +1143,187 @@ static int check_inode(struct btree_trans *trans,
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (!full &&
- !(u.bi_flags & (BCH_INODE_i_size_dirty|
- BCH_INODE_i_sectors_dirty|
- BCH_INODE_unlinked)))
- return 0;
-
- if (prev->bi_inum != u.bi_inum)
- *prev = u;
-
- if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u),
- c, inode_snapshot_mismatch,
- "inodes in different snapshots don't match")) {
- bch_err(c, "repair not implemented yet");
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
- bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
- struct bpos new_min_pos;
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ goto err;
- ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+ if (snapshot_root->bi_inum != u.bi_inum) {
+ ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
if (ret)
goto err;
+ }
- u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
+ if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
+ trans, inode_snapshot_mismatch,
+ "inode hash info in different snapshots don't match")) {
+ u.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
+ do_update = true;
+ }
- ret = __write_inode(trans, &u, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck updating inode");
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
- return ret;
+ goto err;
+ }
- if (!bpos_eq(new_min_pos, POS_MIN))
- bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
- return 0;
+ if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
}
- if (u.bi_flags & BCH_INODE_unlinked) {
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- return ret;
+ if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
+ /* Check for this early so that check_unreachable_inode() will reattach it */
+
+ ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
+ if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
+ goto err;
- fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
+ fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
+ "dir unlinked but not empty\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf));
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
ret = 0;
}
- if (u.bi_flags & BCH_INODE_unlinked &&
- (!c->sb.clean ||
- fsck_err(c, inode_unlinked_but_clean,
- "filesystem marked clean, but inode %llu unlinked",
- u.bi_inum))) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- return ret;
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+ trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be %u)\n%s",
+ ret,
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ if (ret)
+ u.bi_flags |= BCH_INODE_has_child_snapshot;
+ else
+ u.bi_flags &= ~BCH_INODE_has_child_snapshot;
+ do_update = true;
}
+ ret = 0;
- if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!c->sb.clean ||
- fsck_err(c, inode_i_size_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_size dirty",
- u.bi_inum))) {
- bch_verbose(c, "truncating inode %llu", u.bi_inum);
+ if ((u.bi_flags & BCH_INODE_unlinked) &&
+ !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
+ if (!test_bit(BCH_FS_started, &c->flags)) {
+ /*
+ * If we're not in online fsck, don't delete unlinked
+ * inodes, just make sure they're on the deleted list.
+ *
+ * They might be referred to by a logged operation -
+ * i.e. we might have crashed in the middle of a
+ * truncate on an unlinked but open file - so we want to
+ * let the delete_dead_inodes kill it after resuming
+ * logged ops.
+ */
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ goto err_noprint;
- /*
- * XXX: need to truncate partial blocks too here - or ideally
- * just switch units to bytes and that issue goes away
- */
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
- iter->pos.snapshot),
- POS(u.bi_inum, U64_MAX),
- 0, NULL);
- bch_err_msg(c, ret, "in fsck truncating inode");
- if (ret)
- return ret;
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
- /*
- * We truncated without our normal sector accounting hook, just
- * make sure we recalculate it:
- */
- u.bi_flags |= BCH_INODE_i_sectors_dirty;
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
+ if (ret)
+ goto err;
+ } else {
+ ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+ if (ret < 0)
+ goto err;
- u.bi_flags &= ~BCH_INODE_i_size_dirty;
+ if (fsck_err_on(!ret,
+ trans, inode_unlinked_and_not_open,
+ "inode %llu:%u unlinked and not open",
+ u.bi_inum, u.bi_snapshot)) {
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ goto err_noprint;
+ }
+ ret = 0;
+ }
+ }
+
+ if (fsck_err_on(u.bi_parent_subvol &&
+ (u.bi_subvol == 0 ||
+ u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+ trans, inode_bi_parent_nonzero,
+ "inode %llu:%u has subvol %u but nonzero parent subvol %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+ u.bi_parent_subvol = 0;
do_update = true;
}
- if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!c->sb.clean ||
- fsck_err(c, inode_i_sectors_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_sectors dirty",
- u.bi_inum))) {
- s64 sectors;
+ if (u.bi_subvol) {
+ struct bch_subvolume s;
- bch_verbose(c, "recounting sectors for inode %llu",
- u.bi_inum);
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
- if (sectors < 0) {
- bch_err_msg(c, sectors, "in fsck recounting inode sectors");
- return sectors;
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
}
- u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
- do_update = true;
+ if (fsck_err_on(ret,
+ trans, inode_bi_subvol_missing,
+ "inode %llu:%u bi_subvol points to missing subvolume %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+ fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+ !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+ k.k->p.snapshot),
+ trans, inode_bi_subvol_wrong,
+ "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+ le64_to_cpu(s.inode),
+ le32_to_cpu(s.snapshot))) {
+ u.bi_subvol = 0;
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
}
- if (u.bi_flags & BCH_INODE_backptr_untrusted) {
- u.bi_dir = 0;
- u.bi_dir_offset = 0;
- u.bi_flags &= ~BCH_INODE_backptr_untrusted;
+ if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
-
+do_update:
if (do_update) {
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
- return ret;
+ goto err_noprint;
}
err:
fsck_err:
bch_err_fn(c, ret);
+err_noprint:
+ printbuf_exit(&buf);
return ret;
}
int bch2_check_inodes(struct bch_fs *c)
{
- bool full = c->opts.fsck;
- struct bch_inode_unpacked prev = { 0 };
+ struct bch_inode_unpacked snapshot_root = {};
struct snapshots_seen s;
snapshots_seen_init(&s);
@@ -973,41 +1331,178 @@ int bch2_check_inodes(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &prev, &s, full)));
+ check_inode(trans, &iter, k, &snapshot_root, &s)));
snapshots_seen_exit(&s);
bch_err_fn(c, ret);
return ret;
}
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ /*
+ * We look for inodes to reattach in natural key order, leaves first,
+ * but we should do the reattach at the oldest version that needs to be
+ * reattached:
+ */
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+ continue;
+
+ if (!bkey_is_inode(k.k))
+ break;
+
+ struct bch_inode_unpacked parent_inode;
+ ret = bch2_inode_unpack(k, &parent_inode);
+ if (ret)
+ break;
+
+ if (!inode_should_reattach(&parent_inode))
+ break;
+
+ *inode = parent_inode;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
}
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
+static int check_unreachable_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ return ret;
+
+ if (!inode_should_reattach(&inode))
+ return 0;
+
+ ret = find_oldest_inode_needs_reattach(trans, &inode);
+ if (ret)
+ return ret;
+
+ if (fsck_err(trans, inode_unreachable,
+ "unreachable inode:\n%s",
+ (bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we node that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_unreachable_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
}
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
+static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+ switch (btree) {
+ case BTREE_ID_extents:
+ return S_ISREG(mode) || S_ISLNK(mode);
+ case BTREE_ID_dirents:
+ return S_ISDIR(mode);
+ case BTREE_ID_xattrs:
+ return true;
+ default:
+ BUG();
+ }
}
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_key_has_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct inode_walker_entry *i,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ return ret;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
+
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
+ if (fsck_err_on(!i,
+ trans, key_in_missing_inode,
+ "key in missing inode:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ trans, key_in_wrong_inode_type,
+ "key for wrong inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+delete:
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
+ goto out;
+}
+
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1021,25 +1516,32 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
- return -BCH_ERR_internal_fsck_err;
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
+ i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
- c, inode_i_sectors_wrong,
+ trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_i_sectors_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
struct extent_end {
@@ -1118,9 +1620,9 @@ static int overlapping_extents_found(struct btree_trans *trans,
BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOT_EXTENTS);
- k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents);
+ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
goto err;
@@ -1145,7 +1647,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
while (1) {
bch2_btree_iter_advance(&iter2);
- k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
+ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
ret = bkey_err(k2);
if (ret)
goto err;
@@ -1168,7 +1670,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
- if (fsck_err(c, extent_overlapping,
+ if (fsck_err(trans, extent_overlapping,
"overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
@@ -1181,7 +1683,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ BTREE_UPDATE_internal_snapshot_node,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
@@ -1222,7 +1724,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
struct snapshots_seen *seen,
struct extent_ends *extent_ends,
struct bkey_s_c k,
- u32 equiv,
struct btree_iter *iter,
bool *fixed)
{
@@ -1255,10 +1756,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1290,81 +1787,59 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
struct snapshots_seen *s,
- struct extent_ends *extent_ends)
+ struct extent_ends *extent_ends,
+ struct disk_reservation *res)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv = k.k->p;
int ret = 0;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
}
- if (inode->last_pos.inode != k.k->p.inode) {
+ if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
ret = check_i_sectors(trans, inode);
if (ret)
goto err;
}
- i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
- ret = PTR_ERR_OR_ZERO(i);
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
goto err;
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(extent_i);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!i, c, extent_in_missing_inode,
- "extent in missing inode:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
-
- if (fsck_err_on(i &&
- !S_ISREG(i->inode.bi_mode) &&
- !S_ISLNK(i->inode.bi_mode),
- c, extent_in_non_reg_inode,
- "extent in non regular inode mode %o:\n %s",
- i->inode.bi_mode,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
+ ret = check_key_has_inode(trans, iter, inode, extent_i, k);
+ if (ret)
+ goto err;
- ret = check_overlapping_extents(trans, s, extent_ends, k,
- equiv.snapshot, iter,
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
&inode->recalculate_sums);
if (ret)
goto err;
- }
-
- /*
- * Check inodes in reverse order, from oldest snapshots to newest,
- * starting from the inode that matches this extent's snapshot. If we
- * didn't have one, iterate over all inodes:
- */
- if (!i)
- i = inode->inodes.data + inode->inodes.nr - 1;
- for (;
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > equiv.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
- continue;
+ /*
+ * Check inodes in reverse order, from oldest snapshots to
+ * newest, starting from the inode that matches this extent's
+ * snapshot. If we didn't have one, iterate over all inodes:
+ */
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
- if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
- k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
- c, extent_past_end_of_inode,
+ trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1374,19 +1849,37 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter2);
if (ret)
goto err;
iter->k.type = KEY_TYPE_whiteout;
+ break;
}
+ }
+ }
+
+ ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (bkey_extent_is_allocation(k.k)) {
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
- if (bkey_extent_is_allocation(k.k))
- i->count += k.k->size;
+ i->count += k.k->size;
}
+ }
- i->seen_this_pos = true;
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
}
out:
err:
@@ -1394,9 +1887,6 @@ fsck_err:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
-delete:
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- goto out;
}
/*
@@ -1414,16 +1904,14 @@ int bch2_check_extents(struct bch_fs *c)
extent_ends_init(&extent_ends);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
- check_i_sectors(trans, &w));
+ check_i_sectors_notnested(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
@@ -1441,7 +1929,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
@@ -1453,10 +1941,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1469,111 +1956,166 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
}
if (fsck_err_on(i->inode.bi_nlink != i->count,
- c, inode_dir_wrong_nlink,
+ trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
}
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_subdir_count_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
+}
+
+noinline_for_stack
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target)
{
struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
struct btree_iter bp_iter = { NULL };
int ret = 0;
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
+ return __bch2_fsck_write_inode(trans, target);
}
- if (!inode_points_to_dirent(target, d)) {
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
+ if (bch2_inode_should_have_single_bp(target) &&
+ !fsck_err(trans, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto err;
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
- c, inode_dir_multiple_links,
- "directory %llu:%u with multiple links\n%s",
- target->bi_inum, target_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (fsck_err_on(!backpointer_exists,
+ trans, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target->bi_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target);
+ goto out;
+ }
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- c, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(backpointer_exists &&
+ (S_ISDIR(target->bi_mode) ||
+ target->bi_subvol),
+ trans, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
- if (fsck_err_on(!backpointer_exists,
- c, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ trans, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target);
+ if (ret)
+ goto err;
}
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+noinline_for_stack
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = check_dirent_inode_dirent(trans, iter, d, target);
+ if (ret)
+ goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
- c, dirent_d_type_wrong,
+ trans, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
@@ -1586,6 +2128,12 @@ static int check_dirent_target(struct btree_trans *trans,
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
@@ -1593,33 +2141,164 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
- c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol)) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
+/* find a subvolume that's a descendent of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+ bch2_trans_iter_exit(trans, &iter);
+ *subvolid = k.k->p.offset;
+ goto found;
+ }
+ }
+ if (!ret)
+ ret = -ENOENT;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+noinline_for_stack
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c_dirent d)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter subvol_iter = {};
+ struct bch_inode_unpacked subvol_root;
+ u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
+ u64 parent_inum;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
+ if (fsck_err_on(ret,
+ trans, dirent_to_missing_parent_subvol,
+ "dirent parent_subvol points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+ fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+ trans, dirent_not_visible_in_parent_subvol,
+ "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+ parent_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+ ret = PTR_ERR_OR_ZERO(new_dirent);
if (ret)
goto err;
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
+ }
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ struct bkey_s_c_subvolume s =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, target_subvol),
+ 0, subvolume);
+ ret = bkey_err(s.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret) {
+ if (fsck_err(trans, dirent_to_missing_subvol,
+ "dirent points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+ return __remove_dirent(trans, d.k->p);
+ ret = 0;
+ goto out;
+ }
+
+ if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+ trans, subvol_fs_path_parent_wrong,
+ "subvol with wrong fs_path_parent, should be be %u\n%s",
+ parent_subvol,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- d = dirent_i_to_s_c(n);
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
+
+ u64 target_inum = le64_to_cpu(s.v->inode);
+ u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+ ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
+ trans, inode_bi_parent_wrong,
+ "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_parent_subvol, parent_subvol)) {
+ subvol_root.bi_parent_subvol = parent_subvol;
+ subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &subvol_root);
+ if (ret)
+ goto err;
}
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root);
+ if (ret)
+ goto err;
out:
err:
fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -1631,21 +2310,16 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv;
int ret = 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
}
- equiv = k.k->p;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
goto err;
@@ -1653,46 +2327,29 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (dir->last_pos.inode != k.k->p.inode) {
- ret = check_subdir_count(trans, dir);
+ if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
+ ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
- BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
-
- i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;
- if (dir->first_this_inode && dir->inodes.nr)
- *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
- dir->first_this_inode = false;
-
- if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
- "dirent in nonexisting directory:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- goto out;
- }
+ ret = check_key_has_inode(trans, iter, dir, i, k);
+ if (ret)
+ goto err;
if (!i)
goto out;
- if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
- c, dirent_in_non_dir_inode,
- "dirent in non directory inode type %s:\n%s",
- bch2_d_type_str(inode_d_type(&i->inode)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ dir->first_this_inode = false;
- ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -1704,64 +2361,20 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
- struct bch_inode_unpacked subvol_root;
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 target_snapshot;
- u64 target_inum;
-
- ret = subvol_lookup(trans, target_subvol,
- &target_snapshot, &target_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, dirent_to_missing_subvol,
- "dirent points to missing subvolume %u",
- le32_to_cpu(d.v->d_child_subvol))) {
- ret = __remove_dirent(trans, d.k->p);
- goto err;
- }
-
- ret = lookup_inode(trans, target_inum,
- &subvol_root, &target_snapshot);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, subvol_to_missing_root,
- "subvolume %u points to missing subvolume root %llu",
- target_subvol,
- target_inum)) {
- bch_err(c, "repair not implemented yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
- c, subvol_root_wrong_bi_subvol,
- "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
- target_inum,
- subvol_root.bi_subvol, target_subvol)) {
- subvol_root.bi_subvol = target_subvol;
- ret = __write_inode(trans, &subvol_root, target_snapshot);
- if (ret)
- goto err;
- }
-
- ret = check_dirent_target(trans, iter, d, &subvol_root,
- target_snapshot);
+ ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
goto err;
} else {
- ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
goto err;
if (fsck_err_on(!target->inodes.nr,
- c, dirent_to_missing_inode,
- "dirent points to missing inode: (equiv %u)\n%s",
- equiv.snapshot,
+ trans, dirent_to_missing_inode,
+ "dirent points to missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
@@ -1771,17 +2384,45 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
}
darray_for_each(target->inodes, i) {
- ret = check_dirent_target(trans, iter, d,
- &i->inode, i->snapshot);
+ ret = check_dirent_target(trans, iter, d, &i->inode);
if (ret)
goto err;
}
+
+ darray_for_each(target->deletes, i)
+ if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
+ trans, dirent_to_overwritten_inode,
+ "dirent points to inode overwritten in snapshot %u:\n%s",
+ *i,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ struct btree_iter delete_iter;
+ bch2_trans_iter_init(trans, &delete_iter,
+ BTREE_ID_dirents,
+ SPOS(k.k->p.inode, k.k->p.offset, *i),
+ BTREE_ITER_intent);
+ ret = bch2_btree_iter_traverse(&delete_iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ hash_info,
+ &delete_iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &delete_iter);
+ if (ret)
+ goto err;
+
+ }
}
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
- i->count++;
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
+ if (d.v->d_type == DT_DIR)
+ i->count++;
+ i->i_size += bkey_bytes(d.k);
+ }
out:
err:
fsck_err:
@@ -1804,13 +2445,11 @@ int bch2_check_dirents(struct bch_fs *c)
snapshots_seen_init(&s);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ check_subdir_count_notnested(trans, &dir));
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
@@ -1828,29 +2467,29 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker_entry *i;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
return ret;
+ if (ret)
+ return 0;
- i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
- if (inode->first_this_inode && inode->inodes.nr)
- *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
- inode->first_this_inode = false;
-
- if (fsck_err_on(!i, c, xattr_in_missing_inode,
- "xattr for missing inode %llu",
- k.k->p.inode))
- return bch2_btree_delete_at(trans, iter, 0);
+ ret = check_key_has_inode(trans, iter, inode, i, k);
+ if (ret)
+ return ret;
if (!i)
return 0;
- ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
-fsck_err:
+ if (inode->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ inode->first_this_inode = false;
+
+ ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -1867,11 +2506,13 @@ int bch2_check_xattrs(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+ inode_walker_exit(&inode);
bch_err_fn(c, ret);
return ret;
}
@@ -1888,38 +2529,44 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
"root subvol missing")) {
- struct bkey_i_subvolume root_subvol;
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
+ if (ret)
+ goto err;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
- bkey_subvolume_init(&root_subvol.k_i);
- root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol.v.flags = 0;
- root_subvol.v.snapshot = cpu_to_le32(snapshot);
- root_subvol.v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
}
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ if (mustfix_fsck_err_on(ret,
+ trans, root_dir_missing,
"root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
- c, root_inode_not_dir,
+ trans, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode.bi_inum = inum;
+ root_inode.bi_snapshot = snapshot;
- ret = __write_inode(trans, &root_inode, snapshot);
+ ret = __bch2_fsck_write_inode(trans, &root_inode);
bch_err_msg(c, ret, "writing root inode");
}
err:
@@ -1930,12 +2577,91 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
}
+typedef DARRAY(u32) darray_u32;
+
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+ darray_for_each(*d, i)
+ if (*i == v)
+ return true;
+ return false;
+}
+
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter parent_iter = {};
+ darray_u32 subvol_path = {};
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+ ret = darray_push(&subvol_path, k.k->p.offset);
+ if (ret)
+ goto err;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ struct bch_inode_unpacked subvol_root;
+ ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &subvol_root);
+ if (ret)
+ break;
+
+ u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+ if (darray_u32_has(&subvol_path, parent)) {
+ if (fsck_err(c, subvol_loop, "subvolume loop"))
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &parent_iter);
+ bch2_trans_iter_init(trans, &parent_iter,
+ BTREE_ID_subvolumes, POS(0, parent), 0);
+ k = bch2_btree_iter_peek_slot(&parent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+ trans, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+ }
+fsck_err:
+err:
+ printbuf_exit(&buf);
+ darray_exit(&subvol_path);
+ bch2_trans_iter_exit(trans, &parent_iter);
+ return ret;
+}
+
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_path(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
struct pathbuf_entry {
u64 inum;
u32 snapshot;
@@ -1943,6 +2669,48 @@ struct pathbuf_entry {
typedef DARRAY(struct pathbuf_entry) pathbuf;
+static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
+ u32 new_depth)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, p->inum, p->snapshot), 0);
+
+ struct bch_inode_unpacked inode;
+ int ret = bkey_err(k) ?:
+ !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto err;
+
+ if (inode.bi_depth != new_depth) {
+ inode.bi_depth = new_depth;
+ ret = __bch2_fsck_write_inode(trans, &inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
+{
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ darray_for_each_reverse(*path, i) {
+ ret = nested_lockrestart_do(trans,
+ bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
+ bch_err_fn(trans->c, ret);
+ if (ret)
+ break;
+
+ new_bi_depth++;
+ }
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
darray_for_each(*p, i)
@@ -1952,152 +2720,134 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct bch_fs *c, pathbuf *p,
- u64 inum, u32 snapshot)
-{
- int ret = darray_push(p, ((struct pathbuf_entry) {
- .inum = inum,
- .snapshot = snapshot,
- }));
-
- if (ret)
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- p->size);
- return ret;
-}
-
-/*
- * Check that a given inode is reachable from the root:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
-static int check_path(struct btree_trans *trans,
- pathbuf *p,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = {};
+ pathbuf path = {};
+ struct printbuf buf = PRINTBUF;
+ u32 snapshot = inode_k.k->p.snapshot;
+ bool redo_bi_depth = false;
+ u32 min_bi_depth = U32_MAX;
int ret = 0;
- snapshot = bch2_snapshot_equiv(c, snapshot);
- p->nr = 0;
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(inode_k, &inode);
+ if (ret)
+ return ret;
- while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
u32 parent_snapshot = snapshot;
- if (inode->bi_subvol) {
- u64 inum;
-
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &parent_snapshot, &inum);
- if (ret)
- break;
- }
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot));
+ d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
- break;
+ goto out;
- if (!ret && !dirent_points_to_inode(d, inode)) {
+ if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- }
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, inode_unreachable,
- "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
- inode->bi_inum, snapshot,
- bch2_d_type_str(inode_d_type(inode)),
- inode->bi_nlink,
- inode->bi_dir,
- inode->bi_dir_offset))
- ret = reattach_inode(trans, inode, snapshot);
- break;
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, inode_k);
+ bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+ bch2_err_str(ret), buf.buf);
+ goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode->bi_mode))
- break;
-
- ret = path_down(c, p, inode->bi_inum, snapshot);
- if (ret) {
- bch_err(c, "memory allocation failure");
+ ret = darray_push(&path, ((struct pathbuf_entry) {
+ .inum = inode.bi_inum,
+ .snapshot = snapshot,
+ }));
+ if (ret)
return ret;
- }
snapshot = parent_snapshot;
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inode.bi_dir, snapshot), 0);
+
+ struct bch_inode_unpacked parent_inode;
+ ret = bkey_err(inode_k) ?:
+ !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(inode_k, &parent_inode);
if (ret) {
/* Should have been caught in dirents pass */
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error looking up parent directory: %i", ret);
- break;
+ bch_err_msg(c, ret, "error looking up parent directory");
+ goto out;
}
- if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ min_bi_depth = parent_inode.bi_depth;
+
+ if (parent_inode.bi_depth < inode.bi_depth &&
+ min_bi_depth < U16_MAX)
+ break;
+
+ inode = parent_inode;
+ snapshot = inode_k.k->p.snapshot;
+ redo_bi_depth = true;
+
+ if (path_is_dup(&path, inode.bi_inum, snapshot)) {
/* XXX print path */
bch_err(c, "directory structure loop");
- darray_for_each(*p, i)
+ darray_for_each(path, i)
pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode->bi_inum, snapshot);
+ pr_err("%llu:%u", inode.bi_inum, snapshot);
- if (!fsck_err(c, dir_loop, "directory structure loop"))
- return 0;
-
- ret = remove_backpointer(trans, inode);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ if (fsck_err(trans, dir_loop, "directory structure loop")) {
+ ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
- if (ret)
- break;
+ if (ret)
+ break;
- ret = reattach_inode(trans, inode, snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
- break;
+ ret = reattach_inode(trans, &inode);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ }
+
+ goto out;
}
}
+
+ if (inode.bi_subvol)
+ min_bi_depth = 0;
+
+ if (redo_bi_depth)
+ ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ darray_exit(&path);
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
/*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct bch_inode_unpacked u;
- pathbuf path = { 0, };
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (!bkey_is_inode(k.k))
+ if (!S_ISDIR(bkey_inode_mode(k)))
continue;
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (u.bi_flags & BCH_INODE_unlinked)
+ if (bch2_inode_flags(k) & BCH_INODE_unlinked)
continue;
- check_path(trans, &path, &u, iter.pos.snapshot);
+ check_path_loop(trans, k);
})));
- darray_exit(&path);
bch_err_fn(c, ret);
return ret;
@@ -2187,15 +2937,17 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_inodes,
POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
if (!bkey_is_inode(k.k))
continue;
/* Should never fail, checked by bch2_inode_invalid: */
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(k, &u));
+ _ret3 = bch2_inode_unpack(k, &u);
+ if (_ret3)
+ break;
/*
* Backpointer and directory structure checks are sufficient for
@@ -2204,6 +2956,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
if (S_ISDIR(u.bi_mode))
continue;
+ /*
+ * Previous passes ensured that bi_nlink is nonzero if
+ * it had multiple hardlinks:
+ */
if (!u.bi_nlink)
continue;
@@ -2230,9 +2986,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
@@ -2243,8 +2999,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
if (d.v->d_type != DT_DIR &&
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
+ le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
}
0;
})));
@@ -2260,7 +3015,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
struct nlink_table *links,
size_t *idx, u64 range_end)
{
- struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct nlink *link = &links->d[*idx];
int ret = 0;
@@ -2271,7 +3025,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
if (S_ISDIR(u.bi_mode))
return 0;
@@ -2286,12 +3042,12 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
}
if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
- c, inode_wrong_nlink,
+ trans, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __write_inode(trans, &u, k.k->p.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u);
}
fsck_err:
return ret;
@@ -2307,7 +3063,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
@@ -2375,7 +3131,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
u->v.front_pad = 0;
u->v.back_pad = 0;
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
}
int bch2_fix_reflink_p(struct bch_fs *c)
@@ -2386,10 +3142,230 @@ int bch2_fix_reflink_p(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
}
+
+#ifndef NO_BCACHEFS_CHARDEV
+
+struct fsck_thread {
+ struct thread_with_stdio thr;
+ struct bch_fs *c;
+ struct bch_opts opts;
+};
+
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
+ kfree(thr);
+}
+
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
+{
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
+
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
+ return ret;
+}
+
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+ struct bch_ioctl_fsck_offline arg;
+ struct fsck_thread *thr = NULL;
+ darray_str(devs) = {};
+ long ret = 0;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
+
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
+
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+ opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
+
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
+ return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
+}
+
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
+{
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
+
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
+
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ return ret;
+}
+
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
+long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
+{
+ struct fsck_thread *thr = NULL;
+ long ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bch2_ro_ref_tryget(c))
+ return -EROFS;
+
+ if (down_trylock(&c->online_fsck_mutex)) {
+ bch2_ro_ref_put(c);
+ return -EAGAIN;
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->c = c;
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
+err:
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ }
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index da991e8cf27e..574948278cd4 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -2,14 +2,27 @@
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
+#include "str_hash.h"
+
+int bch2_fsck_update_backpointers(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc,
+ struct bch_hash_info *,
+ struct bkey_i *);
+
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
+long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
+
#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 086f0090b03a..339b80770f1d 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -8,10 +8,13 @@
#include "buckets.h"
#include "compress.h"
#include "dirent.h"
+#include "disk_accounting.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
+#include "fs.h"
#include "inode.h"
+#include "opts.h"
#include "str_hash.h"
#include "snapshot.h"
#include "subvolume.h"
@@ -19,7 +22,7 @@
#include <linux/random.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define x(name, ...) #name,
const char * const bch2_inode_opts[] = {
@@ -33,6 +36,8 @@ static const char * const bch2_inode_flag_strs[] = {
};
#undef x
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
@@ -43,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end,
u8 *p;
if (in >= end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
if (!*in)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
/*
* position of highest set bit indicates number of bytes:
@@ -56,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end,
bytes = byte_table[shift - 1];
if (in + bytes > end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
p = (u8 *) be + 16 - bytes;
memcpy(p, in, bytes);
@@ -159,8 +164,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
unsigned fieldnr = 0, field_bits;
int ret;
-#define x(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+#define x(_name, _bits) \
+ if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
memset((void *) unpacked + offset, 0, \
sizeof(*unpacked) - offset); \
@@ -172,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
return ret; \
\
if (field_bits > sizeof(unpacked->_name) * 8) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
\
unpacked->_name = field[1]; \
in += ret;
@@ -213,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v2()
@@ -264,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v3()
@@ -279,6 +284,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -289,10 +296,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
- if (INODE_NEW_VARINT(inode.v)) {
+ if (INODEv1_NEW_VARINT(inode.v)) {
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
bkey_val_end(inode),
- INODE_NR_FIELDS(inode.v));
+ INODEv1_NR_FIELDS(inode.v));
} else {
return bch2_inode_unpack_v1(inode, unpacked);
}
@@ -319,27 +326,27 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- if (likely(k.k->type == KEY_TYPE_inode_v3))
- return bch2_inode_unpack_v3(k, unpacked);
- return bch2_inode_unpack_slowpath(k, unpacked);
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
+ return likely(k.k->type == KEY_TYPE_inode_v3)
+ ? bch2_inode_unpack_v3(k, unpacked)
+ : bch2_inode_unpack_slowpath(k, unpacked);
}
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
+int __bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags,
+ bool warn)
{
- struct bkey_s_c k;
u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
if (ret)
return ret;
- k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_CACHED);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -354,24 +361,16 @@ static int bch2_inode_peek_nowarn(struct btree_trans *trans,
return 0;
err:
+ if (warn)
+ bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
bch2_trans_iter_exit(trans, iter);
return ret;
}
-int bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
- return ret;
-}
-
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_inode_buf *inode_p;
@@ -384,6 +383,30 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
+int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = inode->bi_snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fsck_write_inode(trans, inode));
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;
@@ -405,100 +428,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
return &inode_p->inode.k_i;
}
-static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
+static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bch_inode_unpacked unpacked;
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode, c, err,
- inode_pos_inode_nonzero,
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
"nonzero k.p.inode");
- bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
- inode_pos_blockdev_range,
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
+ c, inode_pos_blockdev_range,
"fs inode in blockdev range");
- bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
- inode_unpack_error,
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
+ c, inode_unpack_error,
"invalid variable length fields");
- bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
- inode_checksum_type_invalid,
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
+ c, inode_checksum_type_invalid,
"invalid data checksum type (%u >= %u",
unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
bkey_fsck_err_on(unpacked.bi_compression &&
- !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
- inode_compression_type_invalid,
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1),
+ c, inode_compression_type_invalid,
"invalid compression opt %u", unpacked.bi_compression - 1);
bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
- unpacked.bi_nlink != 0, c, err,
- inode_unlinked_but_nlink_nonzero,
+ unpacked.bi_nlink != 0,
+ c, inode_unlinked_but_nlink_nonzero,
"flagged as unlinked but bi_nlink != 0");
- bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
- inode_subvol_root_but_not_dir,
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
+ c, inode_subvol_root_but_not_dir,
"subvolume root but not a directory");
fsck_err:
return ret;
}
-int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
- bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
-int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
int ret = 0;
- bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
-int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
int ret = 0;
bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
- inode_v3_fields_start_bad,
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
+ c, inode_v3_fields_start_bad,
"invalid fields_start (got %llu, min %u max %zu)",
INODEv3_FIELDS_START(inode.v),
INODEv3_FIELDS_START_INITIAL,
bkey_val_u64s(inode.k));
- bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
@@ -506,38 +527,35 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
+ prt_printf(out, "\n");
printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o", inode->bi_mode);
- prt_newline(out);
+ prt_printf(out, "mode=%o\n", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, " (%x)", inode->bi_flags);
- prt_newline(out);
+ prt_printf(out, "(%x)\n", inode->bi_flags);
- prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+ prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
+ prt_printf(out, "hash_type=");
+ bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
prt_newline(out);
-
- prt_printf(out, "bi_size=%llu", inode->bi_size);
- prt_newline(out);
-
- prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
- prt_newline(out);
-
- prt_newline(out);
- prt_printf(out, "bi_version=%llu", inode->bi_version);
+ prt_printf(out, "bi_size=%llu\n", inode->bi_size);
+ prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
+ prt_printf(out, "bi_version=%llu\n", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu", (u64) inode->_name); \
- prt_newline(out);
+ prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
+
+ bch2_printbuf_strip_trailing_newline(out);
printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- prt_printf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
__bch2_inode_unpacked_to_text(out, inode);
}
@@ -567,62 +585,207 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
}
}
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+ return;
+ case KEY_TYPE_inode_v2:
+ bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+ return;
+ case KEY_TYPE_inode_v3:
+ bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+ return;
+ default:
+ BUG();
+ }
+}
+
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
+{
+ unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
+
+ return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos pos,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, *iter, btree,
+ bpos_successor(pos),
+ SPOS(pos.inode, pos.offset, U32_MAX),
+ flags|BTREE_ITER_all_snapshots, k, ret)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos, unsigned flags)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+ if (!k.k ||
+ bkey_err(k) ||
+ bkey_is_inode(k.k))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ pos = k.k->p;
+ goto again;
+}
+
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_with_updates, k, ret)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+ bkey_is_inode(k.k)) {
+ ret = 1;
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int update_inode_has_children(struct btree_trans *trans,
+ struct bkey_s k,
+ bool have_child)
+{
+ if (!have_child) {
+ int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret)
+ return ret < 0 ? ret : 0;
+ }
+
+ u64 f = bkey_inode_flags(k.s_c);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot))
+ bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
+
+ return 0;
+}
+
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+ bool have_child)
{
- return bkey_inode_flags(k) & BCH_INODE_unlinked;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+ &iter, pos, BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ if (!have_child) {
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto err;
+ }
+ }
+
+ u64 f = bkey_inode_flags(k);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+ struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
int bch2_trigger_inode(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
+ BUG_ON(!trans->journal_res.seq);
+ bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
+ }
+
s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- if (nr) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (flags & BTREE_TRIGGER_transactional) {
+ int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
+ (int) bkey_is_unlinked_inode(old);
+ if (unlinked_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, unlinked_delta > 0);
if (ret)
return ret;
-
- trans->fs_usage_deltas->nr_inodes += nr;
}
- bool old_deleted = bkey_is_deleted_inode(old);
- bool new_deleted = bkey_is_deleted_inode(new.s_c);
- if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ /*
+ * If we're creating or deleting an inode at this snapshot ID,
+ * and there might be an inode in a parent snapshot ID, we might
+ * need to set or clear the has_child_snapshot flag on the
+ * parent.
+ */
+ int deleted_delta = (int) bkey_is_inode(new.k) -
+ (int) bkey_is_inode(old.k);
+ if (deleted_delta &&
+ bch2_snapshot_parent(c, new.k->p.snapshot)) {
+ int ret = update_parent_inode_has_children(trans, new.k->p,
+ deleted_delta > 0);
if (ret)
return ret;
}
- }
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
- BUG_ON(!trans->journal_res.seq);
-
- bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
- }
-
- if (flags & BTREE_TRIGGER_GC) {
- struct bch_fs *c = trans->c;
-
- percpu_down_read(&c->mark_lock);
- this_cpu_add(c->usage_gc->b.nr_inodes, nr);
- percpu_up_read(&c->mark_lock);
+ /*
+ * When an inode is first updated in a new snapshot, we may need
+ * to clear has_child_snapshot
+ */
+ if (deleted_delta > 0) {
+ int ret = update_inode_has_children(trans, new, false);
+ if (ret)
+ return ret;
+ }
}
return 0;
}
-int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode, c, err,
- inode_pos_inode_nonzero,
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
"nonzero k.p.inode");
fsck_err:
return ret;
@@ -636,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
+int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
+ c, inode_alloc_cursor_inode_bad,
+ "k.p.inode bad");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
+
+ prt_printf(out, "idx %llu generation %llu",
+ le64_to_cpu(i.v->idx),
+ le64_to_cpu(i.v->gen));
+}
+
void bch2_inode_init_early(struct bch_fs *c,
struct bch_inode_unpacked *inode_u)
{
@@ -644,10 +829,8 @@ void bch2_inode_init_early(struct bch_fs *c,
memset(inode_u, 0, sizeof(*inode_u));
- /* ick */
- inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
- get_random_bytes(&inode_u->bi_hash_seed,
- sizeof(inode_u->bi_hash_seed));
+ SET_INODE_STR_HASH(inode_u, str_hash);
+ get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
@@ -698,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+static struct bkey_i_inode_alloc_cursor *
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- u64 min, max, start, pos, *hint;
- int ret = 0;
- unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
- if (c->opts.shard_inode_numbers) {
- bits -= c->inode_shard_bits;
+ u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
+
+ cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
- min = (cpu << bits);
- max = (cpu << bits) | ~(ULLONG_MAX << bits);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_logged_ops,
+ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (ret)
+ return ERR_PTR(ret);
- min = max_t(u64, min, BLOCKDEV_INODE_MAX);
- hint = c->unused_inode_hints + cpu;
+ struct bkey_i_inode_alloc_cursor *cursor =
+ k.k->type == KEY_TYPE_inode_alloc_cursor
+ ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
+ : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
+ ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ goto err;
+
+ if (c->opts.inodes_32bit) {
+ *min = BLOCKDEV_INODE_MAX;
+ *max = INT_MAX;
} else {
- min = BLOCKDEV_INODE_MAX;
- max = ~(ULLONG_MAX << bits);
- hint = c->unused_inode_hints;
+ cursor->v.bits = c->opts.shard_inode_numbers_bits;
+
+ unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
+
+ *min = max(cpu << bits, (u64) INT_MAX + 1);
+ *max = (cpu << bits) | ~(ULLONG_MAX << bits);
}
- start = READ_ONCE(*hint);
+ if (le64_to_cpu(cursor->v.idx) < *min)
+ cursor->v.idx = cpu_to_le64(*min);
+
+ if (le64_to_cpu(cursor->v.idx) >= *max) {
+ cursor->v.idx = cpu_to_le64(*min);
+ le32_add_cpu(&cursor->v.gen, 1);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : cursor;
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
+{
+ u64 min, max;
+ struct bkey_i_inode_alloc_cursor *cursor =
+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+ int ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ return ret;
- if (start >= max || start < min)
- start = min;
+ u64 start = le64_to_cpu(cursor->v.idx);
+ u64 pos = start;
- pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
+ struct bkey_s_c k;
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -764,6 +982,7 @@ again:
/* Retry from start */
pos = start = min;
bch2_btree_iter_set_pos(iter, POS(0, pos));
+ le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
@@ -774,9 +993,9 @@ found_slot:
return ret;
}
- *hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = bkey_generation(k);
+ inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
+ cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
return 0;
}
@@ -795,7 +1014,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
* extent iterator:
*/
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while (1) {
bch2_trans_begin(trans);
@@ -806,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek_upto(&iter, end);
+ k = bch2_btree_iter_peek_max(&iter, end);
ret = bkey_err(k);
if (ret)
goto err;
@@ -817,7 +1036,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -838,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
u32 snapshot;
int ret;
@@ -866,7 +1083,7 @@ retry:
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
- BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ BTREE_ITER_intent|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto err;
@@ -879,13 +1096,7 @@ retry:
goto err;
}
- bch2_inode_unpack(k, &inode_u);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
err:
@@ -893,6 +1104,11 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (ret)
+ goto err2;
+
+ ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
bch2_trans_put(trans);
return ret;
}
@@ -926,8 +1142,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(trans, inum, inode));
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
@@ -977,12 +1192,18 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
struct bch_inode_unpacked *inode)
{
-#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
+#define x(_name, _bits) \
+ if ((inode)->bi_##_name) { \
+ opts->_name = inode->bi_##_name - 1; \
+ opts->_name##_from_inode = true; \
+ } else { \
+ opts->_name = c->opts._name; \
+ opts->_name##_from_inode = false; \
+ }
BCH_INODE_OPTS()
#undef x
- if (opts->nocow)
- opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+ bch2_io_opts_fixups(opts);
}
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
@@ -997,7 +1218,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
return 0;
}
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
@@ -1026,7 +1247,7 @@ retry:
bch2_trans_begin(trans);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ SPOS(0, inum, snapshot), BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1060,6 +1281,45 @@ err:
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+next_parent:
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+ if (ret || !k.k)
+ return ret;
+
+ bool unlinked = bkey_is_unlinked_inode(k);
+ pos = k.k->p;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!unlinked)
+ return 0;
+
+ ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+ if (ret)
+ return ret;
+ goto next_parent;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
+ delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos pos,
@@ -1069,16 +1329,17 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
int ret;
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k), c,
- deleted_inode_missing,
+ if (fsck_err_on(!bkey_is_inode(k.k),
+ trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1088,8 +1349,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
- if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1097,41 +1359,42 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
- deleted_inode_not_unlinked,
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
- if (c->sb.clean &&
- !fsck_err(c,
- deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
-
- if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
- struct bpos new_min_pos;
+ if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ trans, deleted_inode_has_child_snapshots,
+ "inode with child snapshots %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
- ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
- if (ret)
- goto out;
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto out;
- inode.bi_flags &= ~BCH_INODE_unlinked;
+ if (ret) {
+ if (fsck_err(trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be set)\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf))) {
+ inode.bi_flags |= BCH_INODE_has_child_snapshot;
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto out;
+ }
+ goto delete;
- ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- bch_err_msg(c, ret, "clearing inode unlinked flag");
- if (ret)
- goto out;
+ }
- /*
- * We'll need another write buffer flush to pick up the new
- * unlinked inodes in the snapshot leaves:
- */
- *need_another_pass = true;
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+ !fsck_err(trans, deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
goto out;
}
@@ -1139,9 +1402,10 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
return ret;
delete:
- ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
@@ -1151,6 +1415,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
+ /*
+ * if we ran check_inodes() unlinked inodes will have already been
+ * cleaned up but the write buffer will be out of sync; therefore we
+ * alway need a write buffer flush
+ */
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
need_another_pass = false;
/*
@@ -1160,11 +1433,12 @@ again:
* flushed and we'd spin:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
if (ret > 0) {
- bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+ bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
+ k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
/*
@@ -1183,12 +1457,8 @@ again:
ret;
}));
- if (!ret && need_another_pass) {
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
+ if (!ret && need_another_pass)
goto again;
- }
err:
bch2_trans_put(trans);
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b63f312581cf..428b9be6af34 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -5,37 +5,47 @@
#include "bkey.h"
#include "bkey_methods.h"
#include "opts.h"
+#include "snapshot.h"
-enum bkey_invalid_flags;
extern const char * const bch2_inode_opts[];
-int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
+ ? __bch2_inode_has_child_snapshots(trans, pos)
+ : 0;
+}
+
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
- .key_invalid = bch2_inode_invalid, \
+ .key_validate = bch2_inode_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_inode_v2_invalid, \
+ .key_validate = bch2_inode_v2_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 32, \
})
#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
- .key_invalid = bch2_inode_v3_invalid, \
+ .key_validate = bch2_inode_v3_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 48, \
@@ -48,16 +58,26 @@ static inline bool bkey_is_inode(const struct bkey *k)
k->type == KEY_TYPE_inode_v3;
}
-int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
- .key_invalid = bch2_inode_generation_invalid, \
+ .key_validate = bch2_inode_generation_validate, \
.val_to_text = bch2_inode_generation_to_text, \
.min_val_size = 8, \
})
+int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
+ .key_validate = bch2_inode_alloc_cursor_validate, \
+ .val_to_text = bch2_inode_alloc_cursor_to_text, \
+ .min_val_size = 16, \
+})
+
#if 0
typedef struct {
u64 lo;
@@ -68,6 +88,7 @@ typedef u64 u96;
struct bch_inode_unpacked {
u64 bi_inum;
+ u32 bi_snapshot;
u64 bi_journal_seq;
__le64 bi_hash_seed;
u64 bi_size;
@@ -80,6 +101,7 @@ struct bch_inode_unpacked {
BCH_INODE_FIELDS_v3()
#undef x
};
+BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
struct bkey_inode_buf {
struct bkey_i_inode_v3 inode;
@@ -87,7 +109,7 @@ struct bkey_inode_buf {
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[0 + BCH_INODE_FIELDS_v3()];
#undef x
-} __packed __aligned(8);
+};
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
@@ -95,11 +117,29 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned);
+int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
+
+static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
+}
+
+static inline int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
+ int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+ return ret;
+}
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_update_flags);
+ struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
static inline int bch2_inode_write(struct btree_trans *trans,
struct btree_iter *iter,
@@ -108,6 +148,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@@ -172,6 +215,34 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
+static inline unsigned bkey_inode_mode(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
+ case KEY_TYPE_inode_v2:
+ return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
+ case KEY_TYPE_inode_v3:
+ return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
+ default:
+ return 0;
+ }
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
@@ -201,11 +272,29 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
+{
+ bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
+
+ return S_ISDIR(inode->bi_mode) ||
+ (!inode->bi_nlink && inode_has_bp);
+}
+
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+#include "rebalance.h"
+
+static inline struct bch_extent_rebalance
+bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts io_opts;
+ bch2_inode_opts_get(&io_opts, c, inode);
+ return io_opts_to_rebalance_opts(c, &io_opts);
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 83d107331edf..b99a5bf1a75e 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -101,7 +101,9 @@ struct bch_inode_generation {
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
+ x(bi_nocow, 8) \
+ x(bi_depth, 32) \
+ x(bi_inodes_32bit, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -114,7 +116,8 @@ struct bch_inode_generation {
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
- x(nocow, 8)
+ x(nocow, 8) \
+ x(inodes_32bit, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -133,7 +136,8 @@ enum inode_opt_id {
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
- x(backptr_untrusted, 8)
+ x(backptr_untrusted, 8) \
+ x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */
@@ -149,9 +153,9 @@ enum __bch_inode_flags {
#undef x
};
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
@@ -163,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+struct bch_inode_alloc_cursor {
+ struct bch_val v;
+ __u8 bits;
+ __u8 pad;
+ __le32 gen;
+ __le64 idx;
+};
+
#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594cca..5353979117b0 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans,
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
- if (should_print_err(ret))
- bch_err_inum_offset_ratelimited(c,
- inum.inum,
- iter->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ if (should_print_err(ret)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+ prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
@@ -125,8 +127,8 @@ err_noprint:
bch2_bkey_buf_exit(&old, c);
if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
+ bch2_trans_unlock_long(trans);
+ bch2_wait_on_allocator(c, &cl);
}
return ret;
@@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(iter, snapshot);
/*
- * peek_upto() doesn't have ideal semantics for extents:
+ * peek_max() doesn't have ideal semantics for extents:
*/
- k = bch2_btree_iter_peek_upto(iter, end_pos);
+ k = bch2_btree_iter_peek_max(iter, end_pos);
if (!k.k)
break;
@@ -198,7 +200,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, start),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
@@ -224,13 +226,14 @@ void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, str
static int truncate_set_isize(struct btree_trans *trans,
subvol_inum inum,
- u64 new_i_size)
+ u64 new_i_size,
+ bool warn)
{
struct btree_iter iter = { NULL };
struct bch_inode_unpacked inode_u;
int ret;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
(inode_u.bi_size = new_i_size, 0) ?:
bch2_inode_write(trans, &iter, &inode_u);
@@ -247,23 +250,25 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+ bool warn_errors = i_sectors_delta != NULL;
int ret;
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- truncate_set_isize(trans, inum, new_i_size));
+ truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
if (ret)
goto err;
bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
bch2_trans_iter_exit(trans, &fpunch_iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
err:
- bch2_logged_op_finish(trans, op_k);
+ if (warn_errors)
+ bch_err_fn(c, ret);
return ret;
}
@@ -287,9 +292,14 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec
* resume only proceeding in one of the snapshots
*/
down_read(&c->snapshot_create_lock);
- int ret = bch2_trans_run(c,
- bch2_logged_op_start(trans, &op.k_i) ?:
- __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
up_read(&c->snapshot_create_lock);
return ret;
@@ -307,7 +317,8 @@ void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, stru
prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
}
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
+ u64 offset, s64 len, bool warn)
{
struct btree_iter iter;
struct bch_inode_unpacked inode_u;
@@ -316,7 +327,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset
offset <<= 9;
len <<= 9;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
if (ret)
return ret;
@@ -356,15 +367,25 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
u64 len = abs(shift);
u64 pos = le64_to_cpu(op->v.pos);
bool insert = shift > 0;
+ u32 snapshot;
+ bool warn_errors = i_sectors_delta != NULL;
int ret = 0;
ret = bch2_inum_opts_get(trans, inum, &opts);
if (ret)
return ret;
+ /*
+ * check for missing subvolume before fpunch, as in resume we don't want
+ * it to be a fatal error
+ */
+ ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
+ if (ret)
+ return ret;
+
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
switch (op->v.state) {
case LOGGED_OP_FINSERT_start:
@@ -372,7 +393,7 @@ case LOGGED_OP_FINSERT_start:
if (insert) {
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, len) ?:
+ adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
goto err;
@@ -395,11 +416,11 @@ case LOGGED_OP_FINSERT_shift_extents:
struct bkey_i delete, *copy;
struct bkey_s_c k;
struct bpos src_pos = POS(inum.inum, src_offset);
- u32 snapshot;
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
+ warn_errors);
if (ret)
goto btree_err;
@@ -407,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
k = insert
- ? bch2_btree_iter_peek_prev(&iter)
- : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+ ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;
@@ -442,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
@@ -462,12 +483,12 @@ btree_err:
if (!insert) {
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, shift) ?:
+ adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, 0, 0) ?:
+ adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
}
@@ -476,8 +497,9 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
- bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
+ if (warn_errors)
+ bch_err_fn(c, ret);
return ret;
}
@@ -506,9 +528,14 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
* resume only proceeding in one of the snapshots
*/
down_read(&c->snapshot_create_lock);
- int ret = bch2_trans_run(c,
- bch2_logged_op_start(trans, &op.k_i) ?:
- __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
up_read(&c->snapshot_create_lock);
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..aa91fcf51eec 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -21,6 +21,7 @@
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
+#include "reflink.h"
#include "subvolume.h"
#include "trace.h"
@@ -58,7 +59,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
}
rcu_read_unlock();
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
+ return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@@ -84,29 +85,38 @@ struct promote_op {
};
static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
};
+static inline bool have_io_error(struct bch_io_failures *failed)
+{
+ return failed && failed->nr;
+}
+
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
- unsigned flags)
+ unsigned flags,
+ struct bch_io_failures *failed)
{
- BUG_ON(!opts.promote_target);
+ if (!have_io_error(failed)) {
+ BUG_ON(!opts.promote_target);
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return -BCH_ERR_nopromote_may_not;
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
- if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
- if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+ }
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
@@ -163,7 +173,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
unsigned sectors,
- struct bch_read_bio **rbio)
+ struct bch_read_bio **rbio,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
@@ -174,7 +185,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
@@ -216,14 +227,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+ struct data_update_opts update_opts = {};
+
+ if (!have_io_error(failed)) {
+ update_opts.target = opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
+ } else {
+ update_opts.target = opts.foreground_target;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev))
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+ }
+
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- (struct data_update_opts) {
- .target = opts.promote_target,
- .extra_replicas = 1,
- .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
- },
+ update_opts,
btree_id, k);
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
@@ -243,7 +268,8 @@ err:
bio_free_pages(&(*rbio)->bio);
kfree(*rbio);
*rbio = NULL;
- kfree(op);
+ /* We may have added to the rhashtable and thus need rcu freeing: */
+ kfree_rcu(op, rcu);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
@@ -257,10 +283,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
unsigned flags,
struct bch_read_bio **rbio,
bool *bounce,
- bool *read_full)
+ bool *read_full,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
- bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /*
+ * if failed != NULL we're not actually doing a promote, we're
+ * recovering from an io/checksum error
+ */
+ bool promote_full = (have_io_error(failed) ||
+ *read_full ||
+ READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -271,7 +304,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct promote_op *promote;
int ret;
- ret = should_promote(c, k, pos, opts, flags);
+ ret = should_promote(c, k, pos, opts, flags, failed);
if (ret)
goto nopromote;
@@ -279,7 +312,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio);
+ k, pos, pick, opts, sectors, rbio, failed);
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@@ -294,6 +327,20 @@ nopromote:
/* Read */
+static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ return bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9);
+}
+
+static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
+}
+
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -378,17 +425,17 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ rbio->read_pos, BTREE_ITER_slots);
retry:
+ bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
goto err;
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
@@ -472,6 +519,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
}
}
+static void bch2_read_io_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bio *bio = &rbio->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
+
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+}
+
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_read_bio *rbio)
{
@@ -487,11 +557,11 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
if ((ret = bkey_err(k)))
goto out;
- if (bversion_cmp(k.k->version, rbio->version) ||
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -523,7 +593,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
goto out;
ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -531,8 +601,75 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+static void bch2_read_csum_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bio *src = &rbio->bio;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decompress_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decompression error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decrypt_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decrypt error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
}
/* Inner part that may run in process context */
@@ -541,7 +678,6 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -642,31 +778,13 @@ csum_err:
goto out;
}
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data %s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decompression error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decrypt_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decrypt error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
}
@@ -675,7 +793,7 @@ static void bch2_read_endio(struct bio *bio)
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
@@ -687,17 +805,13 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ if (unlikely(bio->bi_status)) {
+ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
return;
}
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr)) {
+ (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -718,72 +832,45 @@ static void bch2_read_endio(struct bio *bio)
bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
-int __bch2_read_indirect_extent(struct btree_trans *trans,
- unsigned *offset_into_extent,
- struct bkey_buf *orig_k)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 reflink_offset;
- int ret;
-
- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
- *offset_into_extent;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
- POS(0, reflink_offset), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_reflink_v &&
- k.k->type != KEY_TYPE_indirect_inline_data) {
- bch_err_inum_offset_ratelimited(trans->c,
- orig_k->k->k.p.inode,
- orig_k->k->k.p.offset << 9,
- "%llu len %u points to nonexistent indirect extent %llu",
- orig_k->k->k.p.offset,
- orig_k->k->k.size,
- reflink_offset);
- bch2_inconsistent_error(trans->c);
- ret = -EIO;
- goto err;
- }
-
- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c k,
struct bch_extent_ptr ptr)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
struct btree_iter iter;
struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(c, &ptr),
- BTREE_ITER_CACHED);
+ PTR_BUCKET_POS(ca, &ptr),
+ BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:");
- printbuf_indent_add(&buf, 2);
- prt_newline(&buf);
+ int gen = bucket_gen_get(ca, iter.pos.offset);
+ if (gen >= 0) {
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+ prt_printf(&buf, "memory gen: %u", gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
- prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
}
bch2_fs_inconsistent(c, "%s", buf.buf);
@@ -801,7 +888,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -825,14 +911,29 @@ retry_pick:
if (!pick_ret)
goto hole;
- if (pick_ret < 0) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode, read_pos.offset << 9,
- "no device to read from");
+ if (unlikely(pick_ret < 0)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
goto err;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -842,25 +943,24 @@ retry_pick:
*/
if ((flags & BCH_READ_IN_RETRY) &&
!pick.ptr.cached &&
- unlikely(ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ ca &&
+ unlikely(dev_ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
+ percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(trans);
-
if (flags & BCH_READ_NODECODE) {
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
goto hole;
+ }
iter.bi_size = pick.crc.compressed_size << 9;
goto get_bio;
@@ -888,9 +988,9 @@ retry_pick:
bounce = true;
}
- if (orig->opts.promote_target)
+ if (orig->opts.promote_target || have_io_error(failed))
promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
+ &rbio, &bounce, &read_full, failed);
if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@@ -965,7 +1065,7 @@ get_bio:
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
@@ -977,10 +1077,13 @@ get_bio:
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
- rbio->version = k.k->version;
+ rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
+ if (flags & BCH_READ_NODECODE)
+ orig->pick = pick;
+
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
@@ -995,7 +1098,7 @@ get_bio:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -1004,12 +1107,25 @@ get_bio:
trace_and_count(c, read_split, &orig->bio);
}
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ if (!(flags & BCH_READ_IN_RETRY))
+ bch2_trans_unlock(trans);
+ else
+ bch2_trans_unlock_long(trans);
+
if (!rbio->pick.idx) {
- if (!rbio->have_ioref) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode,
- read_pos.offset << 9,
- "no device to read from");
+ if (unlikely(!rbio->have_ioref)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
+ prt_printf(&buf, "no device to read from:\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1035,7 +1151,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1047,6 +1163,8 @@ out:
if (likely(!(flags & BCH_READ_IN_RETRY))) {
return 0;
} else {
+ bch2_trans_unlock(trans);
+
int ret;
rbio->context = RBIO_CONTEXT_UNBOUND;
@@ -1097,34 +1215,26 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
- u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ POS(inum.inum, bvec_iter.bi_sector),
+ BTREE_ITER_slots);
+
while (1) {
- unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, bvec_iter.bi_sector));
@@ -1132,18 +1242,18 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
- offset_into_extent = iter.pos.offset -
+ s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
+ unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&sk, c, k);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -1151,9 +1261,9 @@ retry:
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
*/
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
@@ -1163,36 +1273,36 @@ retry:
data_btree, k,
offset_into_extent, failed, flags);
if (ret)
- break;
+ goto err;
if (flags & BCH_READ_LAST_FRAGMENT)
break;
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ ret != READ_RETRY &&
+ ret != READ_RETRY_AVOID)
break;
}
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- ret == READ_RETRY ||
- ret == READ_RETRY_AVOID)
- goto retry;
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
+ bch2_trans_iter_exit(trans, &iter);
if (ret) {
- bch_err_inum_offset_ratelimited(c, inum.inum,
- bvec_iter.bi_sector << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
}
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
}
void bch2_fs_io_read_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index d9c18bb7d403..a82e8a94ccb6 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
+#include "reflink.h"
struct bch_read_bio {
struct bch_fs *c;
@@ -79,19 +80,32 @@ struct bch_devs_mask;
struct cache_promote_op;
struct extent_ptr_decoded;
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
- struct bkey_buf *);
-
static inline int bch2_read_indirect_extent(struct btree_trans *trans,
enum btree_id *data_btree,
- unsigned *offset_into_extent,
- struct bkey_buf *k)
+ s64 *offset_into_extent,
+ struct bkey_buf *extent)
{
- if (k->k->k.type != KEY_TYPE_reflink_p)
+ if (extent->k->k.type != KEY_TYPE_reflink_p)
return 0;
*data_btree = BTREE_ID_reflink;
- return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
+ offset_into_extent,
+ bkey_i_to_s_c_reflink_p(extent->k),
+ true, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_missing_indirect_extent;
+ }
+
+ bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
}
enum bch_read_flags {
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 2c098ac017b3..03892388832b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
u64 io_latency = time_after64(now, submit_time)
? now - submit_time
: 0;
- u64 old, new, v = atomic64_read(latency);
+ u64 old, new;
+ old = atomic64_read(latency);
do {
- old = v;
-
/*
* If the io latency was reasonably close to the current
* latency, skip doing the update and atomic operation - most of
@@ -84,11 +83,11 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
break;
new = ewma_add(old, io_latency, 5);
- } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+ } while (!atomic64_try_cmpxchg(latency, &old, new));
bch2_congested_acct(ca, io_latency, now, rw);
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@@ -165,8 +164,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_upto_continue_norestart(iter,
- new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_max_continue_norestart(iter,
+ new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -199,9 +198,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
u64 new_i_size,
s64 i_sectors_delta)
{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
@@ -213,26 +209,38 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
- unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
+ unsigned inode_update_flags = BTREE_UPDATE_nojournal;
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
- BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
+ BTREE_ITER_intent|
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ret;
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
if (unlikely(ret))
goto err;
}
- inode = bkey_i_to_inode_v3(k);
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -251,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
inode_update_flags);
err:
bch2_trans_iter_exit(trans, &iter);
@@ -360,9 +368,9 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
bkey_start_pos(&sk.k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -388,6 +396,31 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
+static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
+ u64 offset)
+{
+ bch2_inum_offset_err_msg(op->c, out,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ prt_printf(out, "write error%s: ",
+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
+{
+ __bch2_write_op_error(out, op, op->pos.offset);
+}
+
+static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_write_op *op, u64 offset)
+{
+ bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ prt_printf(out, "write error%s: ",
+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k,
@@ -399,13 +432,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(!bch2_dev_exists2(c, ptr->dev));
-
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = nocow
+ ? bch2_dev_have_ref(c, ptr->dev)
+ : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOFS, &ca->replica_set));
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
@@ -422,11 +454,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = nocow || bch2_dev_get_ioref(ca,
- type == BCH_DATA_btree ? READ : WRITE);
+ n->have_ioref = ca != NULL;
n->nocow = nocow;
n->submit_time = local_clock();
n->inode_offset = bkey_start_offset(&k->k);
+ if (nocow)
+ n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
n->bio.bi_iter.bi_sector = ptr->offset;
if (likely(n->have_ioref)) {
@@ -473,7 +506,6 @@ static void bch2_write_done(struct closure *cl)
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
- struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
@@ -525,13 +557,14 @@ static void __bch2_write_index(struct bch_write_op *op)
op->written += sectors_start - keylist_sectors(keys);
- if (ret && !bch2_err_matches(ret, EROFS)) {
+ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
- bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
if (ret)
@@ -547,7 +580,7 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
goto out;
}
@@ -582,7 +615,7 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
- if ((op->flags & BCH_WRITE_DONE) &&
+ if ((op->flags & BCH_WRITE_SUBMITTED) &&
(op->flags & BCH_WRITE_MOVE))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
@@ -614,9 +647,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
while (1) {
spin_lock_irq(&wp->writes_lock);
- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
- if (op)
- list_del(&op->wp_list);
+ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
wp_update_state(wp, op != NULL);
spin_unlock_irq(&wp->writes_lock);
@@ -627,7 +658,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@@ -641,7 +672,9 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref
+ ? bch2_dev_have_ref(c, wbio->dev)
+ : NULL;
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
@@ -652,8 +685,12 @@ static void bch2_write_endio(struct bio *bio)
op->flags |= BCH_WRITE_IO_ERROR;
}
- if (wbio->nocow)
+ if (wbio->nocow) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ POS(ca->dev_idx, wbio->nocow_bucket),
+ BUCKET_NOCOW_LOCK_UPDATE);
set_bit(wbio->dev, op->devs_need_flush->d);
+ }
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -684,7 +721,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
- e->k.version = version;
+ e->k.bversion = version;
if (crc.csum_type ||
crc.compression_type ||
@@ -846,7 +883,7 @@ static enum prep_encoded_ret {
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;
- if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ if (bch2_bio_uncompress_inplace(op, bio))
return PREP_ENCODED_ERR;
}
@@ -1067,7 +1104,14 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1091,30 +1135,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
return false;
e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec)
+ if (crc_is_encoded(p.crc) || p.has_ec) {
+ rcu_read_unlock();
return false;
+ }
replicas += bch2_extent_ptr_durability(c, &p);
}
+ rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
-
- for_each_keylist_key(&op->insert_keys, k) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- }
-}
-
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *orig,
@@ -1148,7 +1183,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0) ?:
bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
@@ -1157,9 +1192,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct btree_trans *trans = bch2_trans_get(c);
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1167,10 +1202,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
if (ret && !bch2_err_matches(ret, EROFS)) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
- bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
if (ret) {
@@ -1184,8 +1220,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- bch2_nocow_write_unlock(op);
-
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
op->error = -EIO;
} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
@@ -1215,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
struct bucket_to_lock *stale_at;
- int ret;
+ int stale, ret;
if (op->flags & BCH_WRITE_MOVE)
return;
@@ -1231,12 +1265,16 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bio *bio = &op->wbio.bio;
buckets.nr = 0;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -1256,14 +1294,15 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (unlikely(!ca))
+ goto err_get_ioref;
+
+ struct bpos b = PTR_BUCKET_POS(ca, ptr);
struct nocow_lock_bucket *l =
bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
prefetch(l);
- if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
- goto err_get_ioref;
-
/* XXX allocating memory with btree locks held - rare */
darray_push_gfp(&buckets, ((struct bucket_to_lock) {
.b = b, .gen = ptr->gen, .l = l,
@@ -1282,16 +1321,14 @@ retry:
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+ struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
bucket_to_u64(i->b),
BUCKET_NOCOW_LOCK_UPDATE);
- rcu_read_lock();
- bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
- rcu_read_unlock();
-
+ int gen = bucket_gen_get(ca, i->b.offset);
+ stale = gen < 0 ? gen : gen_after(gen, i->gen);
if (unlikely(stale)) {
stale_at = i;
goto err_bucket_stale;
@@ -1305,7 +1342,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
}
op->pos.offset += bio_sectors(bio);
@@ -1319,7 +1356,7 @@ retry:
op->insert_keys.top, true);
bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_SUBMITTED)
break;
bch2_btree_iter_advance(&iter);
}
@@ -1329,19 +1366,21 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode, op->pos.offset << 9,
- "%s: btree lookup error %s", __func__, bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
}
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_DONE)) {
+ if (!(op->flags & BCH_WRITE_SUBMITTED)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
@@ -1359,7 +1398,7 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
/* Fall back to COW path: */
goto out;
@@ -1370,8 +1409,18 @@ err_bucket_stale:
break;
}
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ } else {
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ }
+ printbuf_exit(&buf);
+
goto err_get_ioref;
}
@@ -1387,7 +1436,7 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_SUBMITTED)
goto out_nofs_restore;
}
again:
@@ -1414,7 +1463,7 @@ again:
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
@@ -1424,9 +1473,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
+ &op->cl, &wp)));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@@ -1442,14 +1489,16 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_DONE;
-
- if (ret < 0) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ op->flags |= BCH_WRITE_SUBMITTED;
+
+ if (unlikely(ret < 0)) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
op->error = ret;
break;
}
@@ -1476,12 +1525,13 @@ err:
* once, as that signals backpressure to the caller.
*/
if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_DONE) &&
+ (!(op->flags & BCH_WRITE_SUBMITTED) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
- closure_sync(&op->cl);
+ bch2_wait_on_allocator(c, &op->cl);
+
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1500,8 +1550,10 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
+ memset(&op->failed, 0, sizeof(op->failed));
+
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
@@ -1518,7 +1570,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos;
- id->k.version = op->version;
+ id->k.bversion = op->version;
id->k.size = sectors;
iter = bio->bi_iter;
@@ -1564,16 +1616,19 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "misaligned write");
+ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "misaligned write");
+ printbuf_exit(&buf);
op->error = -EIO;
goto err;
}
@@ -1633,8 +1688,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
- prt_newline(out);
+ prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}
@@ -1642,13 +1696,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
void bch2_fs_io_write_exit(struct bch_fs *c)
{
mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
int bch2_fs_io_write_init(struct bch_fs *c)
{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS))
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+ bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
return -BCH_ERR_ENOMEM_bio_write_init;
if (mempool_init_page_pool(&c->bio_bounce_pages,
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 6c276a48f95d..b4626013abc8 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
+
#define BCH_WRITE_FLAGS() \
x(ALLOC_NOWAIT) \
x(CACHED) \
@@ -33,7 +35,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
x(SYNC) \
x(MOVE) \
x(IN_WORKER) \
- x(DONE) \
+ x(SUBMITTED) \
x(IO_ERROR) \
x(CONVERT_UNWRITTEN)
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index c7f97c2c4805..6e878a6f2f0b 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -20,6 +20,7 @@ struct bch_write_bio {
u64 submit_time;
u64 inode_offset;
+ u64 nocow_bucket;
struct bch_devs_list failed;
u8 dev;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb57..05b1250619ec 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,31 +27,59 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_printf(out, "seq:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
+ prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
- prt_printf(out, "size:");
- prt_tab(out);
+ prt_printf(out, "size:\t");
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_printf(out, "expires:\t");
+ prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
+
+ prt_printf(out, "flags:\t");
+ if (buf->noflush)
+ prt_str(out, "noflush ");
+ if (buf->must_flush)
+ prt_str(out, "must_flush ");
+ if (buf->separate_flush)
+ prt_str(out, "separate_flush ");
+ if (buf->need_flush_to_write_buffer)
+ prt_str(out, "need_flush_to_write_buffer ");
+ if (buf->write_started)
+ prt_str(out, "write_started ");
+ if (buf->write_allocated)
+ prt_str(out, "write_allocated ");
+ if (buf->write_done)
+ prt_str(out, "write_done");
prt_newline(out);
printbuf_indent_sub(out, 2);
@@ -66,26 +94,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
+ prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
@@ -104,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(p->list); i++)
- INIT_LIST_HEAD(&p->list[i]);
- INIT_LIST_HEAD(&p->flushed);
+ for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
+ INIT_LIST_HEAD(&p->unflushed[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
+ INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
@@ -174,21 +182,46 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
+void bch2_journal_do_writes(struct journal *j)
+{
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *w = j->buf + idx;
+
+ if (w->write_started && !w->write_allocated)
+ break;
+ if (w->write_started)
+ continue;
+
+ if (!journal_state_count(j->reservations, idx)) {
+ w->write_started = true;
+ closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+ }
+
+ break;
+ }
+}
+
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
- if (write)
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ bch2_journal_do_writes(j);
+
+ /*
+ * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
+ * open journal entry
+ */
+ wake_up(&j->wait);
}
/*
@@ -202,7 +235,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
unsigned sectors;
BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
@@ -210,19 +242,23 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
lockdep_assert_held(&j->lock);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
new.cur_entry_offset = closed_val;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
old.cur_entry_offset == new.cur_entry_offset)
return;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
if (!__journal_entry_is_open(old))
return;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
+ old.cur_entry_offset = j->cur_entry_offset_if_blocked;
+
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
@@ -283,6 +319,16 @@ void bch2_journal_halt(struct journal *j)
spin_unlock(&j->lock);
}
+void bch2_journal_halt_locked(struct journal *j)
+{
+ lockdep_assert_held(&j->lock);
+
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
+}
+
static bool journal_entry_want_write(struct journal *j)
{
bool ret = !journal_entry_is_open(j) ||
@@ -325,7 +371,6 @@ static int journal_entry_open(struct journal *j)
((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
int u64s;
- u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
@@ -346,6 +391,13 @@ static int journal_entry_open(struct journal *j)
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
return JOURNAL_ERR_max_in_flight;
+ if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
+ bch_err(c, "cannot start: journal seq overflow");
+ if (bch2_fs_emergency_read_only_locked(c))
+ bch_err(c, "fatal error - emergency read only");
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ }
+
BUG_ON(!j->cur_entry_sectors);
buf->expires =
@@ -380,11 +432,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
+ buf->write_started = false;
+ buf->write_allocated = false;
+ buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -401,9 +456,9 @@ static int journal_entry_open(struct journal *j)
*/
j->cur_entry_u64s = u64s;
- v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
@@ -415,12 +470,13 @@ static int journal_entry_open(struct journal *j)
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
- mod_delayed_work(c->io_complete_wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
+ if (nr_unwritten_journal_entries(j) == 1)
+ mod_delayed_work(j->wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@@ -445,20 +501,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- long delta;
spin_lock(&j->lock);
- if (!__journal_entry_is_open(j->reservations))
- goto unlock;
-
- delta = journal_cur_buf(j)->expires - jiffies;
+ if (__journal_entry_is_open(j->reservations)) {
+ long delta = journal_cur_buf(j)->expires - jiffies;
- if (delta > 0)
- mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+ if (delta > 0)
+ mod_delayed_work(j->wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ }
spin_unlock(&j->lock);
}
@@ -476,30 +528,29 @@ retry:
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
- spin_lock(&j->lock);
+ if (j->blocked)
+ return -BCH_ERR_journal_res_get_blocked;
- /* check once more in case somebody else shut things down... */
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return -BCH_ERR_erofs_journal_err;
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ ret = JOURNAL_ERR_journal_full;
+ can_discard = j->can_discard;
+ goto out;
}
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+ ret = JOURNAL_ERR_max_in_flight;
+ goto out;
+ }
+
+ spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
- spin_unlock(&j->lock);
- return 0;
- }
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- /*
- * Don't want to close current journal entry, just need to
- * invoke reclaim:
- */
- ret = JOURNAL_ERR_journal_full;
+ ret = 0;
goto unlock;
}
@@ -515,30 +566,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j);
-
- if (ret == JOURNAL_ERR_max_in_flight) {
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, true);
- if (trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_journal_bufs_to_text(&buf, j);
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- }
- count_event(c, journal_entry_full);
- }
+ ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
-
- if (!ret)
+out:
+ if (ret == JOURNAL_ERR_retry)
goto retry;
+ if (!ret)
+ return 0;
+
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
+ if (ret == JOURNAL_ERR_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@@ -562,6 +613,16 @@ unlock:
: -BCH_ERR_journal_res_get_blocked;
}
+static unsigned max_dev_latency(struct bch_fs *c)
+{
+ u64 nsecs = 0;
+
+ for_each_rw_member(c, ca)
+ nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
+
+ return nsecs_to_jiffies(nsecs);
+}
+
/*
* Essentially the entry function to the journaling code. When bcachefs is doing
* a btree insert, it calls this function to get the current journal write.
@@ -573,10 +634,37 @@ unlock:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned flags)
+ unsigned flags,
+ struct btree_trans *trans)
{
int ret;
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ HZ))
+ return ret;
+
+ if (trans)
+ bch2_trans_unlock_long(trans);
+
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
+
+ remaining_wait = max(0, remaining_wait - HZ);
+
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ remaining_wait))
+ return ret;
+
+ struct printbuf buf = PRINTBUF;
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+ buf.buf);
+ printbuf_exit(&buf);
+
closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK));
@@ -625,7 +713,7 @@ out:
* @seq: seq to flush
* @parent: closure object to wait with
* Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
- * -EIO if @seq will never be flushed
+ * -BCH_ERR_journal_flush_err if @seq will never be flushed
*
* Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
* necessary
@@ -648,7 +736,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
- ret = -EIO;
+ ret = -BCH_ERR_journal_flush_err;
goto out;
}
@@ -669,12 +757,18 @@ recheck_need_open:
spin_unlock(&j->lock);
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ /*
+ * We're called from bch2_journal_flush_seq() -> wait_event();
+ * but this might block. We won't usually block, so we won't
+ * livelock:
+ */
+ sched_annotate_sleep();
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;
seq = res.seq;
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -692,8 +786,8 @@ recheck_need_open:
}
/*
- * if write was kicked off without a flush, flush the next sequence
- * number instead
+ * if write was kicked off without a flush, or if we promised it
+ * wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@@ -702,6 +796,7 @@ recheck_need_open:
}
buf->must_flush = true;
+ j->flushing_seq = max(j->flushing_seq, seq);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
@@ -713,7 +808,7 @@ out:
return ret;
}
-int bch2_journal_flush_seq(struct journal *j, u64 seq)
+int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
{
u64 start_time = local_clock();
int ret, ret2;
@@ -724,7 +819,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
if (seq <= j->flushed_seq_ondisk)
return 0;
- ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+ ret = wait_event_state(j->wait,
+ (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
+ task_state);
if (!ret)
bch2_time_stats_update(j->flush_seq_time, start_time);
@@ -743,14 +840,15 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
int bch2_journal_flush(struct journal *j)
{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
}
/*
- * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
+ * range [start, end)
* @seq
*/
-bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 unwritten_seq;
@@ -759,20 +857,20 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
return false;
- if (seq <= c->journal.flushed_seq_ondisk)
+ if (c->journal.flushed_seq_ondisk >= start)
return false;
spin_lock(&j->lock);
- if (seq <= c->journal.flushed_seq_ondisk)
+ if (c->journal.flushed_seq_ondisk >= start)
goto out;
for (unwritten_seq = journal_last_unwritten_seq(j);
- unwritten_seq < seq;
+ unwritten_seq < end;
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
- /* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ /* journal flush already in flight, or flush requseted */
+ if (buf->must_flush)
goto out;
buf->noflush = true;
@@ -784,19 +882,14 @@ out:
return ret;
}
-int bch2_journal_meta(struct journal *j)
+static int __bch2_journal_meta(struct journal *j)
{
- struct journal_buf *buf;
- struct journal_res res;
- int ret;
-
- memset(&res, 0, sizeof(res));
-
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ struct journal_res res = {};
+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;
- buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -806,7 +899,19 @@ int bch2_journal_meta(struct journal *j)
bch2_journal_res_put(j, &res);
- return bch2_journal_flush_seq(j, res.seq);
+ return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
+}
+
+int bch2_journal_meta(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
+ return -EROFS;
+
+ int ret = __bch2_journal_meta(j);
+ bch2_write_ref_put(c, BCH_WRITE_REF_journal);
+ return ret;
}
/* block/unlock the journal: */
@@ -814,25 +919,58 @@ int bch2_journal_meta(struct journal *j)
void bch2_journal_unblock(struct journal *j)
{
spin_lock(&j->lock);
- j->blocked--;
+ if (!--j->blocked &&
+ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
+ union journal_res_state old, new;
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ new.v = old.v;
+ new.cur_entry_offset = j->cur_entry_offset_if_blocked;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+ }
spin_unlock(&j->lock);
journal_wake(j);
}
+static void __bch2_journal_block(struct journal *j)
+{
+ if (!j->blocked++) {
+ union journal_res_state old, new;
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ j->cur_entry_offset_if_blocked = old.cur_entry_offset;
+
+ if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
+ break;
+
+ new.v = old.v;
+ new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ }
+}
+
void bch2_journal_block(struct journal *j)
{
spin_lock(&j->lock);
- j->blocked++;
+ __bch2_journal_block(j);
spin_unlock(&j->lock);
journal_quiesce(j);
}
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
{
struct journal_buf *ret = NULL;
+ /* We're inside wait_event(), but using mutex_lock(: */
+ sched_annotate_sleep();
mutex_lock(&j->buf_lock);
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
@@ -844,13 +982,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
struct journal_buf *buf = j->buf + idx;
if (buf->need_flush_to_write_buffer) {
- if (seq == journal_cur_seq(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
union journal_res_state s;
s.v = atomic64_read_acquire(&j->reservations.counter);
- ret = journal_state_count(s, idx)
+ unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
+
+ if (open && !*blocked) {
+ __bch2_journal_block(j);
+ *blocked = true;
+ }
+
+ ret = journal_state_count(s, idx) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@@ -863,18 +1005,24 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
return ret;
}
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
{
struct journal_buf *ret;
+ *blocked = false;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
+ max_seq, blocked)) != ERR_PTR(-EAGAIN));
+ if (IS_ERR_OR_NULL(ret) && *blocked)
+ bch2_journal_unblock(j);
- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
return ret;
}
/* allocate journal on a device: */
-static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
+static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@@ -896,30 +1044,29 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
}
for (nr_got = 0; nr_got < nr_want; nr_got++) {
- if (new_fs) {
- bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
- if (bu[nr_got] < 0) {
- ret = -BCH_ERR_ENOSPC_bucket_alloc;
- break;
- }
- } else {
- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
- ret = PTR_ERR_OR_ZERO(ob[nr_got]);
- if (ret)
- break;
+ enum bch_watermark watermark = new_fs
+ ? BCH_WATERMARK_btree
+ : BCH_WATERMARK_normal;
+ ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
+ BCH_DATA_journal, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ if (!new_fs) {
ret = bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size));
+ ca->mi.bucket_size, BTREE_TRIGGER_transactional));
if (ret) {
bch2_open_bucket_put(c, ob[nr_got]);
bch_err_msg(c, ret, "marking new journal buckets");
break;
}
-
- bu[nr_got] = ob[nr_got]->bucket;
}
+
+ bu[nr_got] = ob[nr_got]->bucket;
}
if (!nr_got)
@@ -959,8 +1106,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (ret)
goto err_unblock;
- if (!new_fs)
- bch2_write_super(c);
+ bch2_write_super(c);
/* Commit: */
if (c)
@@ -991,11 +1137,11 @@ err_unblock:
for (i = 0; i < nr_got; i++)
bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0));
+ bu[i], BCH_DATA_free, 0,
+ BTREE_TRIGGER_transactional));
err_free:
- if (!new_fs)
- for (i = 0; i < nr_got; i++)
- bch2_open_bucket_put(c, ob[i]);
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
kfree(new_bucket_seq);
kfree(new_buckets);
@@ -1004,26 +1150,20 @@ err_free:
return ret;
}
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
+static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr, bool new_fs)
{
struct journal_device *ja = &ca->journal;
- struct closure cl;
int ret = 0;
+ struct closure cl;
closure_init_stack(&cl);
- down_write(&c->state_lock);
-
/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
- goto unlock;
+ return 0;
- while (ja->nr < nr) {
+ while (!ret && ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0, 0 };
/*
@@ -1036,29 +1176,42 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
* filesystem-wide allocation will succeed, this is a device
* specific allocation - we can hang here:
*/
+ if (!new_fs) {
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
+ }
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
+ ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
- ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+ if (ret == -BCH_ERR_bucket_alloc_blocked ||
+ ret == -BCH_ERR_open_buckets_empty)
+ ret = 0; /* wait and retry */
bch2_disk_reservation_put(c, &disk_res);
-
closure_sync(&cl);
-
- if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
- break;
}
- bch_err_fn(c, ret);
-unlock:
+ return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use if it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ down_write(&c->state_lock);
+ int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
up_write(&c->state_lock);
+
+ bch_err_fn(c, ret);
return ret;
}
-int bch2_dev_journal_alloc(struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
unsigned nr;
int ret;
@@ -1080,7 +1233,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+ ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;
@@ -1092,7 +1245,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
if (ca->journal.nr)
continue;
- int ret = bch2_dev_journal_alloc(ca);
+ int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
@@ -1130,6 +1283,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ if (!test_bit(JOURNAL_running, &j->flags))
+ return;
+
bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
@@ -1139,15 +1295,19 @@ void bch2_fs_journal_stop(struct journal *j)
* Always write a new journal entry, to make sure the clock hands are up
* to date (and match the superblock)
*/
- bch2_journal_meta(j);
+ __bch2_journal_meta(j);
journal_quiesce(j);
+ cancel_delayed_work_sync(&j->write_work);
- BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j));
+ WARN(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j),
+ "journal shutdown error: cur seq %llu but last empty seq %llu",
+ journal_cur_seq(j), j->last_empty_seq);
- cancel_delayed_work_sync(&j->write_work);
+ if (!bch2_journal_error(j))
+ clear_bit(JOURNAL_running, &j->flags);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
@@ -1157,13 +1317,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
+ if (cur_seq >= JOURNAL_SEQ_MAX) {
+ bch_err(c, "cannot start: journal seq overflow");
+ return -EINVAL;
+ }
+
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@@ -1196,7 +1360,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1211,18 +1375,18 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr)
+ bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
if (!had_entries)
- j->last_empty_seq = cur_seq;
+ j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
- set_bit(JOURNAL_STARTED, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
@@ -1240,13 +1404,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
+ struct journal_device *ja = &ca->journal;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ kfree(ja->bio[i]);
+ ja->bio[i] = NULL;
+ }
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
+ kfree(ja->buckets);
+ kfree(ja->bucket_seq);
+ ja->buckets = NULL;
+ ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1424,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
- unsigned i, nr_bvecs;
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1440,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
- nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+ unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
- ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!ca->journal.bio)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ nr_bvecs), GFP_KERNEL);
+ if (!ja->bio[i])
+ return -BCH_ERR_ENOMEM_dev_journal_init;
- bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+ ja->bio[i]->ca = ca;
+ ja->bio[i]->buf_idx = i;
+ bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ }
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@@ -1287,14 +1459,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned j, dst = 0;
+ unsigned dst = 0;
- for (i = 0; i < nr; i++)
- for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ for (unsigned i = 0; i < nr; i++)
+ for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
- for (i = 0; i < ja->nr; i++)
+ for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@@ -1303,19 +1475,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned i;
+ if (j->wq)
+ destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
- for (i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@@ -1336,19 +1508,32 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
- for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
+ j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
+
+ j->wq = alloc_workqueue("bcachefs_journal",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+ if (!j->wq)
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
/* debug: */
+static const char * const bch2_journal_flags_strs[] = {
+#define x(n) #n,
+ JOURNAL_FLAGS()
+#undef x
+ NULL
+};
+
void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -1356,20 +1541,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
unsigned long now = jiffies;
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 28);
out->atomic++;
rcu_read_lock();
s = READ_ONCE(j->reservations);
+ prt_printf(out, "flags:\t");
+ prt_bitflags(out, bch2_journal_flags_strs, j->flags);
+ prt_newline(out);
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
@@ -1378,49 +1566,52 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_newline(out);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
- prt_printf(out, "current entry:\t\t");
+ prt_printf(out, "current entry:\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error");
+ prt_printf(out, "error\n");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed");
+ prt_printf(out, "closed\n");
+ break;
+ case JOURNAL_ENTRY_BLOCKED_VAL:
+ prt_printf(out, "blocked\n");
break;
default:
- prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+ prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- prt_newline(out);
- prt_printf(out, "unwritten entries:");
- prt_newline(out);
+ prt_printf(out, "unwritten entries:\n");
bch2_journal_bufs_to_text(out, j);
- prt_printf(out,
- "replay done:\t\t%i\n",
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
prt_printf(out, "space:\n");
- prt_printf(out, "\tdiscarded\t%u:%u\n",
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "discarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
- prt_printf(out, "\tclean ondisk\t%u:%u\n",
+ prt_printf(out, "clean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "\tclean\t\t%u:%u\n",
+ prt_printf(out, "clean\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
- prt_printf(out, "\ttotal\t\t%u:%u\n",
+ prt_printf(out, "total\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
+ printbuf_indent_sub(out, 2);
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->mi.durability)
+ continue;
+
struct journal_device *ja = &ca->journal;
if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
@@ -1429,16 +1620,21 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "\tnr\t\t%u\n", ja->nr);
- prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
- prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
+ prt_printf(out, "durability %u:\n", ca->mi.durability);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "nr\t%u\n", ja->nr);
+ prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ printbuf_indent_sub(out, 2);
}
+ prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
+
rcu_read_unlock();
--out->atomic;
@@ -1450,57 +1646,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
__bch2_journal_debug_to_text(out, j);
spin_unlock(&j->lock);
}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
- unsigned i;
-
- spin_lock(&j->lock);
- *seq = max(*seq, j->pin.front);
-
- if (*seq >= j->pin.back) {
- spin_unlock(&j->lock);
- return true;
- }
-
- out->atomic++;
-
- pin_list = journal_seq_pin(j, *seq);
-
- prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
- list_for_each_entry(pin, &pin_list->list[i], list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
-
- if (!list_empty(&pin_list->flushed)) {
- prt_printf(out, "flushed:");
- prt_newline(out);
- }
-
- list_for_each_entry(pin, &pin_list->flushed, list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
-
- printbuf_indent_sub(out, 2);
-
- --out->atomic;
- spin_unlock(&j->lock);
-
- return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
- u64 seq = 0;
-
- while (!bch2_journal_seq_pins_to_text(out, j, &seq))
- seq++;
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb8a..107f7f901cd9 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
}
bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,9 +283,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
- }
+ } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
+ wake_up(&j->wait);
}
/*
@@ -310,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned);
+ unsigned, struct btree_trans *);
/* First bits for BCH_WATERMARK: */
enum journal_res_flags {
@@ -326,10 +328,10 @@ static inline int journal_res_get_fast(struct journal *j,
unsigned flags)
{
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
/*
* Check if there is still room in the current journal
@@ -355,8 +357,8 @@ static inline int journal_res_get_fast(struct journal *j,
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
res->ref = true;
res->idx = old.idx;
@@ -366,19 +368,20 @@ static inline int journal_res_get_fast(struct journal *j,
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s, unsigned flags)
+ unsigned u64s, unsigned flags,
+ struct btree_trans *trans)
{
int ret;
EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
res->u64s = u64s;
if (journal_res_get_fast(j, res, flags))
goto out;
- ret = bch2_journal_res_get_slowpath(j, res, flags);
+ ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
if (ret)
return ret;
out:
@@ -400,39 +403,38 @@ void bch2_journal_entry_res_resize(struct journal *,
int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64);
+bool bch2_journal_noflush_seq(struct journal *, u64, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
+void bch2_journal_halt_locked(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -EIO : 0;
+ ? -BCH_ERR_journal_shutdown : 0;
}
struct bch_dev;
static inline void bch2_journal_set_replay_done(struct journal *j)
{
- BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
- set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ BUG_ON(!test_bit(JOURNAL_running, &j->flags));
+ set_bit(JOURNAL_replay_done, &j->flags);
}
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
-int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 47805193f18c..11c39e0c34f4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,63 @@
#include "sb-clean.h"
#include "trace.h"
+#include <linux/ioprio.h>
+#include <linux/string_choices.h>
+
+void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ for_each_member_device(c, ca) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
+ m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
+ }
+}
+
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+
+ unsigned idx = le32_to_cpu(m.last_journal_bucket);
+ if (idx < ca->journal.nr)
+ ca->journal.cur_idx = idx;
+ unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
+ if (offset <= ca->mi.bucket_size)
+ ca->journal.sectors_free = ca->mi.bucket_size - offset;
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ darray_for_each(j->ptrs, i) {
+ if (i != j->ptrs.data)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ i->dev, i->bucket, i->bucket_offset, i->sector);
+ }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+ bch2_journal_ptrs_to_text(out, c, j);
+
+ for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+ break;
+ }
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -52,13 +109,15 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ kvfree(i);
}
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
- i->ignore = true;
+ if (blacklisted)
+ i->ignore_blacklisted = true;
+ else
+ i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@@ -84,11 +143,15 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
- struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
+ if (!c->journal.oldest_seq_found_ondisk ||
+ le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
+ c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
+
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
le64_to_cpu(j->seq) < jlist->last_seq)
@@ -108,12 +171,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
- journal_replay_free(c, i);
+
+ journal_replay_free(c, i, false);
}
}
@@ -131,72 +195,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
- if (bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes)) {
- i = dup;
- goto found;
- }
+ bool identical = bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes);
+ bool not_identical = !identical &&
+ entry_ptr.csum_good &&
+ dup->csum_good;
+
+ bool same_device = false;
+ darray_for_each(dup->ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ same_device = true;
+
+ ret = darray_push(&dup->ptrs, entry_ptr);
+ if (ret)
+ goto out;
- if (!entry_ptr.csum_good) {
- i = dup;
- goto found;
- }
+ bch2_journal_replay_to_text(&buf, c, dup);
+
+ fsck_err_on(same_device,
+ c, journal_entry_dup_same_device,
+ "duplicate journal entry on same device\n %s",
+ buf.buf);
+
+ fsck_err_on(not_identical,
+ c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries\n %s",
+ buf.buf);
- if (!dup->csum_good)
+ if (entry_ptr.csum_good && !identical)
goto replace;
- fsck_err(c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- i = dup;
- goto found;
+ goto out;
}
replace:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
- i->nr_ptrs = 0;
- i->csum_good = entry_ptr.csum_good;
- i->ignore = false;
+ darray_init(&i->ptrs);
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore_blacklisted = false;
+ i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
- i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
- }
-
/* The first ptr should represent the jset we kept: */
- memcpy(i->ptrs + i->nr_ptrs,
- dup->ptrs,
- sizeof(dup->ptrs[0]) * dup->nr_ptrs);
- i->nr_ptrs += dup->nr_ptrs;
+ darray_for_each(dup->ptrs, ptr)
+ darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
+ } else {
+ darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
- return 0;
-found:
- for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
- if (ptr->dev == ca->dev_idx) {
- bch_err(c, "duplicate journal entry %llu on same device",
- le64_to_cpu(i->j.seq));
- goto out;
- }
- }
-
- if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- goto out;
- }
-
- i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -223,7 +277,7 @@ static void journal_entry_err_msg(struct printbuf *out,
if (entry) {
prt_str(out, " type=");
- prt_str(out, bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
}
if (!jset) {
@@ -248,7 +302,7 @@ static void journal_entry_err_msg(struct printbuf *out,
journal_entry_err_msg(&_buf, version, jset, entry); \
prt_printf(&_buf, msg, ##__VA_ARGS__); \
\
- switch (flags & BKEY_INVALID_WRITE) { \
+ switch (from.flags & BCH_VALIDATE_write) { \
case READ: \
mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
@@ -274,14 +328,13 @@ static void journal_entry_err_msg(struct printbuf *out,
static int journal_validate_key(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
- unsigned level, enum btree_id btree_id,
struct bkey_i *k,
- unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from,
+ unsigned version, int big_endian)
{
- int write = flags & BKEY_INVALID_WRITE;
+ enum bch_validate_flags flags = from.flags;
+ int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
- struct printbuf buf = PRINTBUF;
int ret = 0;
if (journal_entry_err_on(!k->k.u64s,
@@ -314,37 +367,23 @@ static int journal_validate_key(struct bch_fs *c,
}
if (!write)
- bch2_bkey_compat(level, btree_id, version, big_endian,
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
write, NULL, bkey_to_packed(k));
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id), write, &buf)) {
- printbuf_reset(&buf);
- journal_entry_err_msg(&buf, version, jset, entry);
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- prt_newline(&buf);
- bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id), write, &buf);
-
- mustfix_fsck_err(c, journal_entry_bkey_invalid,
- "%s", buf.buf);
-
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
+ if (ret == -BCH_ERR_fsck_delete_bkey) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
-
- printbuf_exit(&buf);
return FSCK_DELETED_KEY;
}
+ if (ret)
+ goto fsck_err;
if (write)
- bch2_bkey_compat(level, btree_id, version, big_endian,
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
write, NULL, bkey_to_packed(k));
fsck_err:
- printbuf_exit(&buf);
return ret;
}
@@ -352,18 +391,19 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_i *k = entry->start;
+ from.level = entry->level;
+ from.btree = entry->btree_id;
+
while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry,
- entry->level,
- entry->btree_id,
- k, version, big_endian,
- flags|BKEY_INVALID_JOURNAL);
+ int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
if (ret == FSCK_DELETED_KEY)
continue;
+ else if (ret)
+ return ret;
k = bkey_next(k);
}
@@ -374,15 +414,16 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
- struct bkey_i *k;
bool first = true;
jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
}
- prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
+ bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
+ prt_char(out, ' ');
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
@@ -392,11 +433,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_i *k = entry->start;
int ret = 0;
+ from.root = true;
+ from.level = entry->level + 1;
+ from.btree = entry->btree_id;
+
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s,
c, version, jset, entry,
@@ -413,8 +458,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
return 0;
}
- ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
- version, big_endian, flags);
+ ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
if (ret == FSCK_DELETED_KEY)
ret = 0;
fsck_err:
@@ -431,7 +475,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
/* obsolete, don't care: */
return 0;
@@ -446,7 +490,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -473,7 +517,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
@@ -515,7 +559,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@@ -540,16 +584,16 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- prt_printf(out, "type=%s v=%llu",
- bch2_fs_usage_types[u->entry.btree_id],
- le64_to_cpu(u->v));
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
@@ -566,7 +610,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
goto out;
}
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) {
@@ -593,7 +637,7 @@ static int journal_entry_clock_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
@@ -626,20 +670,19 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+ prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u);
- unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
@@ -651,16 +694,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
return ret;
}
- dev = le32_to_cpu(u->dev);
-
- if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, version, jset, entry,
- journal_entry_dev_usage_bad_dev,
- "bad dev")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
if (journal_entry_err_on(u->pad,
c, version, jset, entry,
journal_entry_dev_usage_bad_pad,
@@ -680,22 +713,28 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+ if (vstruct_bytes(entry) < sizeof(*u))
+ return;
+
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+ printbuf_indent_add(out, 2);
for (i = 0; i < nr_types; i++) {
+ prt_newline(out);
bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
+ printbuf_indent_sub(out, 2);
}
static int journal_entry_log_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
return 0;
}
@@ -704,19 +743,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
- unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
- prt_printf(out, "%.*s", bytes, l->d);
+ prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
}
static int journal_entry_overwrite_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
+ from.flags = 0;
return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, READ);
+ version, big_endian, from);
}
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
@@ -729,10 +768,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, READ);
+ version, big_endian, from);
}
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
@@ -741,10 +780,41 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_datetime_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ struct bkey_validate_context from)
+{
+ unsigned bytes = vstruct_bytes(entry);
+ unsigned expected = 16;
+ int ret = 0;
+
+ if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ struct bkey_validate_context);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
@@ -762,32 +832,40 @@ int bch2_journal_entry_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ struct bkey_validate_context from)
{
return entry->type < BCH_JSET_ENTRY_NR
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, flags)
+ version, big_endian, from)
: 0;
}
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
+ bch2_prt_jset_entry_type(out, entry->type);
+
if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_str(out, ": ");
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- } else {
- prt_printf(out, "(unknown type %u)", entry->type);
}
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
+ struct bkey_validate_context from = {
+ .flags = flags,
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
+
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
vstruct_for_each(jset, entry) {
+ from.journal_offset = (u64 *) entry - jset->_data;
+
if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
c, version, jset, entry,
journal_entry_past_jset_end,
@@ -796,8 +874,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
break;
}
- ret = bch2_journal_entry_validate(c, jset, entry,
- version, JSET_BIG_ENDIAN(jset), flags);
+ ret = bch2_journal_entry_validate(c, jset, entry, version,
+ JSET_BIG_ENDIAN(jset), from);
if (ret)
break;
}
@@ -808,15 +886,19 @@ fsck_err:
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
- unsigned version;
+ struct bkey_validate_context from = {
+ .flags = flags,
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- version = le32_to_cpu(jset->version);
+ unsigned version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
@@ -861,15 +943,16 @@ static int jset_validate_early(struct bch_fs *c,
unsigned bucket_sectors_left,
unsigned sectors_read)
{
- size_t bytes = vstruct_bytes(jset);
- unsigned version;
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- version = le32_to_cpu(jset->version);
+ unsigned version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
@@ -882,6 +965,7 @@ static int jset_validate_early(struct bch_fs *c,
return -EINVAL;
}
+ size_t bytes = vstruct_bytes(jset);
if (bytes > (sectors_read << 9) &&
sectors_read < bucket_sectors_left)
return JOURNAL_ENTRY_REREAD;
@@ -913,11 +997,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
+ n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
- kvpfree(b->data, b->size);
+ kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@@ -950,6 +1034,8 @@ reread:
nr_bvecs = buf_pages(buf->data, sectors_read << 9);
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!bio)
+ return -BCH_ERR_ENOMEM_journal_read_bucket;
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
@@ -1002,6 +1088,13 @@ reread:
goto err;
}
+ if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
+ ja->highest_seq_found = le64_to_cpu(j->seq);
+ ja->cur_idx = bucket;
+ ja->sectors_free = ca->mi.bucket_size -
+ bucket_remainder(ca, offset) - sectors;
+ }
+
/*
* This happens sometimes if we don't have discards on -
* when we've partially overwritten a bucket with new
@@ -1028,9 +1121,7 @@ reread:
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %s",
- bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1072,8 +1163,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct journal_replay *r, **_r;
- struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
unsigned i;
int ret = 0;
@@ -1093,48 +1182,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
goto err;
}
- ja->sectors_free = ca->mi.bucket_size;
-
- mutex_lock(&jlist->lock);
- genradix_for_each_reverse(&c->journal_entries, iter, _r) {
- r = *_r;
-
- if (!r)
- continue;
-
- for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
- vstruct_sectors(&r->j, c->block_bits);
-
- ja->cur_idx = r->ptrs[i].bucket;
- ja->sectors_free = ca->mi.bucket_size - wrote;
- goto found;
- }
- }
- }
-found:
- mutex_unlock(&jlist->lock);
-
- if (ja->bucket_seq[ja->cur_idx] &&
- ja->sectors_free == ca->mi.bucket_size) {
-#if 0
- /*
- * Debug code for ZNS support, where we (probably) want to be
- * correlated where we stopped in the journal to the zone write
- * points:
- */
- bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
- bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
- for (i = 0; i < 3; i++) {
- unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
-
- bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
- }
-#endif
- ja->sectors_free = 0;
- }
-
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
@@ -1144,7 +1191,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvpfree(buf.data, buf.size);
+ kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@@ -1155,27 +1202,6 @@ err:
goto out;
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- unsigned i;
-
- for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
- u64 offset;
-
- div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
- if (i)
- prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- j->ptrs[i].dev,
- j->ptrs[i].bucket,
- j->ptrs[i].bucket_offset,
- j->ptrs[i].sector);
- }
-}
-
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@@ -1224,27 +1250,29 @@ int bch2_journal_read(struct bch_fs *c,
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
-
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ };
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
c, le32_to_cpu(i->j.version), &i->j, NULL,
jset_last_seq_newer_than_seq,
@@ -1280,12 +1308,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
- journal_replay_free(c, i);
+ journal_replay_free(c, i, false);
continue;
}
@@ -1293,7 +1321,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
- i->ignore = true;
+ i->ignore_blacklisted = true;
}
}
@@ -1302,7 +1330,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1335,7 +1363,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err(c, journal_entries_missing,
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
- " next at %s",
+ " next at %s, continue?",
missing_start, missing_end,
*last_seq, *blacklist_seq - 1,
buf1.buf, buf2.buf);
@@ -1351,34 +1379,34 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
+ .e.nr_devs = 0,
.e.nr_required = 1,
};
- unsigned ptr;
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- if (!i->ptrs[ptr].csum_good)
- bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ if (!ptr->csum_good)
+ bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ bch2_dev_have_ref(c, i->ptrs.data[0].dev),
&i->j,
- i->ptrs[0].sector,
+ i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ darray_for_each(i->ptrs, ptr)
+ replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
@@ -1404,27 +1432,50 @@ fsck_err:
/* journal write: */
+static void journal_advance_devs_to_next_bucket(struct journal *j,
+ struct dev_alloc_list *devs,
+ unsigned sectors, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ darray_for_each(*devs, i) {
+ struct bch_dev *ca = rcu_dereference(c->devs[*i]);
+ if (!ca)
+ continue;
+
+ struct journal_device *ja = &ca->journal;
+
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = ca->mi.bucket_size;
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
+ }
+ }
+}
+
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
- struct dev_alloc_list *devs_sorted,
+ struct dev_alloc_list *devs,
unsigned sectors,
unsigned *replicas,
unsigned replicas_want)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_device *ja;
- struct bch_dev *ca;
- unsigned i;
-
- if (*replicas >= replicas_want)
- return;
- for (i = 0; i < devs_sorted->nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+ darray_for_each(*devs, i) {
+ struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
continue;
- ja = &ca->journal;
+ struct journal_device *ja = &ca->journal;
/*
* Check that we can use this device, and aren't already using
@@ -1470,65 +1521,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
- struct journal_device *ja;
- struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
- unsigned i, replicas = 0, replicas_want =
+ unsigned replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
+ bool advance_done = false;
rcu_read_lock();
-retry:
- devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+ /* We might run more than once if we have to stop and do discards: */
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
+ bkey_for_each_ptr(ptrs, p) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
+ if (ca)
+ replicas += ca->mi.durability;
+ }
- __journal_write_alloc(j, w, &devs_sorted,
- sectors, &replicas, replicas_want);
+retry_target:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+retry_alloc:
+ __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
- if (replicas >= replicas_want)
+ if (likely(replicas >= replicas_want))
goto done;
- for (i = 0; i < devs_sorted.nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
- if (!ca)
- continue;
-
- ja = &ca->journal;
-
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
+ if (!advance_done) {
+ journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
+ advance_done = true;
+ goto retry_alloc;
}
- __journal_write_alloc(j, w, &devs_sorted,
- sectors, &replicas, replicas_want);
-
if (replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
- goto retry;
+ advance_done = false;
+ goto retry_target;
}
done:
rcu_read_unlock();
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= replicas_need ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1547,7 +1586,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
- new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1558,7 +1597,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
- kvpfree(new_buf, new_size);
+ kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1568,12 +1607,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 v, seq;
+ u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1593,63 +1632,69 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
- spin_lock(&j->lock);
- seq = le64_to_cpu(w->data->seq);
+ closure_debug_destroy(cl);
+ spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+ w->write_done = true;
+
+ bool completed = false;
+
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ w = j->buf + (seq & JOURNAL_BUF_MASK);
+ if (!w->write_done)
+ break;
- if (!err) {
- if (!JSET_NO_FLUSH(w->data)) {
+ if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
-
bch2_reset_alloc_cursors(c);
}
- } else if (!j->err_seq || seq < j->err_seq)
- j->err_seq = seq;
- j->seq_ondisk = seq;
+ j->seq_ondisk = seq;
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ new.v = old.v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+ BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
- v = atomic64_read(&j->reservations.counter);
- do {
- old.v = new.v = v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
+ new.unwritten_idx++;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
- new.unwritten_idx++;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ closure_wake_up(&w->wait);
+ completed = true;
+ }
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
+ if (completed) {
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, false);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
- closure_wake_up(&w->wait);
- journal_wake(j);
+ journal_wake(j);
+ }
- if (!journal_state_count(new, new.unwritten_idx) &&
- journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
- spin_unlock(&j->lock);
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
- } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1659,47 +1704,52 @@ static CLOSURE_CALLBACK(journal_write_done)
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
-
- spin_unlock(&j->lock);
- mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
- } else {
- spin_unlock(&j->lock);
+ mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+
+ /*
+ * We don't typically trigger journal writes from her - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
- struct bch_dev *ca = bio->bi_private;
+ struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+ struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- unsigned long flags;
+ struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
+ unsigned long flags;
+
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
- closure_put(&j->io);
+ closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
-static CLOSURE_CALLBACK(do_journal_write)
+static CLOSURE_CALLBACK(journal_write_submit)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
@@ -1708,11 +1758,13 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
- bio = ca->journal.bio;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
@@ -1727,11 +1779,50 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] =
- le64_to_cpu(w->data->seq);
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
+}
+
+static CLOSURE_CALLBACK(journal_write_preflush)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ spin_lock(&j->lock);
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
+ spin_unlock(&j->lock);
+ }
+
+ if (w->separate_flush) {
+ for_each_rw_member(c, ca) {
+ percpu_ref_get(&ca->io_ref);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, journal_write_submit, j->wq);
+ } else {
+ /*
+ * no need to punt to another work item if we're not waiting on
+ * preflushes
+ */
+ journal_write_submit(&cl->work);
+ }
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1782,11 +1873,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
- struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
- bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
@@ -1796,17 +1887,32 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
}
}
- if (wb.wb)
- bch2_journal_keys_to_write_buffer_end(c, &wb);
+ if (wb.wb) {
+ ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
+ return ret;
+ }
+ }
+
+ spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+ struct jset_entry_datetime *d =
+ container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+ d->entry.type = BCH_JSET_ENTRY_datetime;
+ d->seconds = cpu_to_le64(ktime_get_real_seconds());
+
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
- BUG_ON(u64s > j->entry_u64s_reserved);
+
+ WARN_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&jset->u64s, u64s);
@@ -1814,7 +1920,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
bytes = vstruct_bytes(jset);
if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
return -EINVAL;
@@ -1842,8 +1948,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret))
+ if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1878,14 +1983,15 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
- if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
return -EIO;
if (error ||
w->noflush ||
(!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ time_before(jiffies, j->last_flush_write +
+ msecs_to_jiffies(c->opts.journal_flush_delay)) &&
+ test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
@@ -1893,9 +1999,10 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
+ w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ clear_bit(JOURNAL_need_flush_write, &j->flags);
}
return 0;
@@ -1903,20 +2010,27 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
CLOSURE_CALLBACK(bch2_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
- struct bio *bio;
- struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
+ for_each_rw_member(c, ca)
+ nr_rw_members++;
+
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+ BUG_ON(!w->write_started);
+ BUG_ON(w->write_allocated);
+ BUG_ON(w->write_done);
j->write_start_time = local_clock();
spin_lock(&j->lock);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
@@ -1942,26 +2056,35 @@ CLOSURE_CALLBACK(bch2_journal_write)
bch2_journal_do_discards(j);
}
- if (ret) {
- __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ if (ret && !bch2_journal_error(j)) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
+ le64_to_cpu(w->data->seq),
+ vstruct_sectors(w->data, c->block_bits),
+ bch2_err_str(ret));
+ __bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
- goto err;
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
}
+ if (ret)
+ goto err;
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
+ w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1969,12 +2092,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -1985,25 +2102,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
- if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
-
- continue_at(cl, do_journal_write, c->io_complete_wq);
+ if (!JSET_NO_FLUSH(w->data))
+ continue_at(cl, journal_write_preflush, j->wq);
+ else
+ continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index c035e7c108e1..12b39fcb4424 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -2,26 +2,38 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
+#include "darray.h"
+
+void bch2_journal_pos_from_member_info_set(struct bch_fs *);
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
+
+struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
- struct journal_ptr {
- bool csum_good;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
- } ptrs[BCH_REPLICAS_MAX];
- unsigned nr_ptrs;
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
- bool ignore;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+ return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@@ -36,12 +48,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
+ for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define jset_entry_for_each_key(_e, _k) \
- for (_k = (_e)->start; \
+ for (struct bkey_i *_k = (_e)->start; \
_k < vstruct_last(_e); \
_k = bkey_next(_k))
@@ -51,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ struct bkey_validate_context);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
@@ -62,4 +74,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts from the start of data, ignoring the shared
+ * fields.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
+
#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c33dca641575..d373cd181a7f 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
+ if (!ja->nr)
+ return 0;
+
unsigned available = (journal_space_from(ja, from) -
ja->cur_idx - 1 + ja->nr) % ja->nr;
@@ -62,14 +65,13 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
- &j->low_on_space_start, low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
- &j->write_buffer_full_start, low_on_wb))
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
@@ -138,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+ unsigned min_bucket_size = U32_MAX;
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->journal.nr)
+ if (!ca->journal.nr ||
+ !ca->mi.durability)
continue;
+ min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
+
space = journal_dev_space_available(j, ca, from);
if (!space.next_entry)
continue;
@@ -165,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
- return dev_space[nr_devs_want - 1];
+ space = dev_space[nr_devs_want - 1];
+ space.next_entry = min(space.next_entry, min_bucket_size);
+ return space;
}
void bch2_journal_space_available(struct journal *j)
@@ -206,6 +214,18 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+ rcu_read_unlock();
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
@@ -226,9 +246,9 @@ void bch2_journal_space_available(struct journal *j)
j->space[journal_space_clean_ondisk].total) &&
(clean - clean_ondisk <= total / 8) &&
(clean_ondisk * 2 > clean))
- set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ set_bit(JOURNAL_may_skip_flush, &j->flags);
else
- clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ clear_bit(JOURNAL_may_skip_flush, &j->flags);
bch2_journal_set_watermark(j);
out:
@@ -307,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
popped = true;
}
- if (popped)
+ if (popped) {
bch2_journal_space_available(j);
+ __closure_wake_up(&j->reclaim_flush_wait);
+ }
}
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -342,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
pin->seq = 0;
list_del_init(&pin->list);
+ if (j->reclaim_flush_wait.list.first)
+ __closure_wake_up(&j->reclaim_flush_wait);
+
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
@@ -359,15 +384,19 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+ journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1)
- return JOURNAL_PIN_btree;
- else if (fn == bch2_btree_key_cache_journal_flush)
- return JOURNAL_PIN_key_cache;
+ fn == bch2_btree_node_flush1) {
+ unsigned idx = fn == bch2_btree_node_flush1;
+ struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+ return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+ } else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_TYPE_key_cache;
else
- return JOURNAL_PIN_other;
+ return JOURNAL_PIN_TYPE_other;
}
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -386,7 +415,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
- list_add(&pin->list, &pin_list->list[type]);
+
+ if (list_empty(&pin_list->unflushed[type]) &&
+ j->reclaim_flush_wait.list.first)
+ __closure_wake_up(&j->reclaim_flush_wait);
+
+ list_add(&pin->list, &pin_list->unflushed[type]);
}
void bch2_journal_pin_copy(struct journal *j,
@@ -394,8 +428,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@@ -411,44 +443,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
- reclaim = __journal_pin_drop(j, dst);
+ bool reclaim = __journal_pin_drop(j, dst);
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+ spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
- reclaim = __journal_pin_drop(j, pin);
+ bool reclaim = __journal_pin_drop(j, pin);
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+
+ spin_unlock(&j->lock);
}
/**
@@ -481,16 +513,15 @@ journal_get_next_pin(struct journal *j,
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- unsigned i;
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
if (*seq > seq_to_flush && !allowed_above_seq)
break;
- for (i = 0; i < JOURNAL_PIN_NR; i++)
- if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
- ((1U << i) & allowed_above_seq)) {
- ret = list_first_entry_or_null(&pin_list->list[i],
+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+ if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ (BIT(i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->unflushed[i],
struct journal_entry_pin, list);
if (ret)
return ret;
@@ -526,8 +557,8 @@ static size_t journal_flush_pins(struct journal *j,
}
if (min_key_cache) {
- allowed_above |= 1U << JOURNAL_PIN_key_cache;
- allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
+ allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
}
cond_resched();
@@ -535,7 +566,9 @@ static size_t journal_flush_pins(struct journal *j,
j->last_flushed = jiffies;
spin_lock(&j->lock);
- pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+ pin = journal_get_next_pin(j, seq_to_flush,
+ allowed_below,
+ allowed_above, &seq);
if (pin) {
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
@@ -558,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@@ -632,6 +665,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -672,7 +706,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
- if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -680,8 +715,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- atomic_read(&c->btree_cache.dirty),
- c->btree_cache.used,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
@@ -748,10 +782,12 @@ static int bch2_journal_reclaim_thread(void *arg)
journal_empty = fifo_empty(&j->pin);
spin_unlock(&j->lock);
+ long timeout = j->next_reclaim - jiffies;
+
if (journal_empty)
schedule();
- else if (time_after(j->next_reclaim, jiffies))
- schedule_timeout(j->next_reclaim - jiffies);
+ else if (timeout > 0)
+ schedule_timeout(timeout);
else
break;
}
@@ -795,10 +831,41 @@ int bch2_journal_reclaim_start(struct journal *j)
return 0;
}
+static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+ unsigned types)
+{
+ struct journal_entry_pin_list *pin_list;
+ u64 seq;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
+ if (seq > seq_to_flush)
+ break;
+
+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+ if ((BIT(i) & types) &&
+ (!list_empty(&pin_list->unflushed[i]) ||
+ !list_empty(&pin_list->flushed[i]))) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+ }
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
+ unsigned types)
+{
+ return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
+ journal_pins_still_flushing(j, seq_to_flush, types);
+}
+
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool *did_work)
{
- int ret;
+ int ret = 0;
ret = bch2_journal_error(j);
if (ret)
@@ -806,12 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- if (journal_flush_pins(j, seq_to_flush,
- (1U << JOURNAL_PIN_key_cache)|
- (1U << JOURNAL_PIN_other), 0, 0, 0) ||
- journal_flush_pins(j, seq_to_flush,
- (1U << JOURNAL_PIN_btree), 0, 0, 0))
- *did_work = true;
+ for (int type = JOURNAL_PIN_TYPE_NR - 1;
+ type >= 0;
+ --type)
+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+ *did_work = true;
+ goto unlock;
+ }
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);
@@ -821,11 +889,12 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
- ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
!fifo_used(&j->pin);
spin_unlock(&j->lock);
+unlock:
mutex_unlock(&j->reclaim_lock);
return ret;
@@ -836,10 +905,10 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
/* time_stats this */
bool did_work = false;
- if (!test_bit(JOURNAL_STARTED, &j->flags))
+ if (!test_bit(JOURNAL_running, &j->flags))
return false;
- closure_wait_event(&j->async_wait,
+ closure_wait_event(&j->reclaim_flush_wait,
journal_flush_done(j, seq_to_flush, &did_work));
return did_work;
@@ -905,3 +974,54 @@ err:
return ret;
}
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+
+ spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "unflushed:\n");
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
+ list_for_each_entry(pin, &pin_list->unflushed[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ prt_printf(out, "flushed:\n");
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
+ list_for_each_entry(pin, &pin_list->flushed[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index ec84c3345281..0a73d7134e1c 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
int bch2_journal_flush_device_pins(struct journal *, int);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index ae4fb8c3a2bc..62b910f2fb27 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r)
return cmp_int(*l, *r);
}
-static int bch2_sb_journal_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal *journal = field_to_type(f, journal);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
@@ -99,13 +98,13 @@ static int u64_range_cmp(const void *_l, const void *_r)
return cmp_int(l->start, r->start);
}
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
+ u64 sum = 0;
unsigned nr;
unsigned i;
struct u64_range *b;
@@ -121,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
for (i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+
+ if (b[i].end <= b[i].start) {
+ prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].nr));
+ goto err;
+ }
+
+ sum += le64_to_cpu(journal->d[i].nr);
}
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
@@ -150,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
}
}
+ if (sum > UINT_MAX) {
+ prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
+ goto err;
+ }
+
ret = 0;
err:
kfree(b);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cfbb..1f25c111c54c 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "btree_iter.h"
#include "eytzinger.h"
+#include "journal.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
@@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
- struct bch_sb_field_journal_seq_blacklist *bl,
- unsigned i)
-{
- unsigned nr = blacklist_nr_entries(bl);
-
- if (le64_to_cpu(bl->start[i].end) >=
- le64_to_cpu(bl->start[i + 1].start)) {
- bl->start[i].end = bl->start[i + 1].end;
- --nr;
- memmove(&bl->start[i],
- &bl->start[i + 1],
- sizeof(bl->start[0]) * (nr - i));
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr));
- BUG_ON(!bl);
- }
-
- return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
- u64 start, u64 end)
-{
- return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i, nr;
+ unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
- for (i = 0; i < nr; i++) {
+ while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
- if (bl_entry_contig_or_overlaps(e, start, end)) {
- e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
- e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
+ if (end < le64_to_cpu(e->start))
+ break;
+
+ if (start > le64_to_cpu(e->end)) {
+ i++;
+ continue;
}
+
+ /*
+ * Entry is contiguous or overlapping with new entry: merge it
+ * with new entry, and delete:
+ */
+
+ start = min(start, le64_to_cpu(e->start));
+ end = max(end, le64_to_cpu(e->end));
+ array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
- bl->start[nr].start = cpu_to_le64(start);
- bl->start[nr].end = cpu_to_le64(end);
-out_write_sb:
+ array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+ .start = cpu_to_le64(start),
+ .end = cpu_to_le64(end),
+ }));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@@ -119,8 +95,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
@@ -165,8 +140,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
- t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
- GFP_KERNEL);
+ t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;
@@ -188,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
return 0;
}
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
@@ -243,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
-void bch2_blacklist_entries_gc(struct work_struct *work)
+bool bch2_blacklist_entries_gc(struct bch_fs *c)
{
- struct bch_fs *c = container_of(work, struct bch_fs,
- journal_seq_blacklist_gc_work);
- struct journal_seq_blacklist_table *t;
- struct bch_sb_field_journal_seq_blacklist *bl;
struct journal_seq_blacklist_entry *src, *dst;
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned i, nr, new_nr;
- int ret;
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
- 0, 0, BTREE_ITER_PREFETCH);
-retry:
- bch2_trans_begin(trans);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- while (!(ret = PTR_ERR_OR_ZERO(b)) &&
- b &&
- !test_bit(BCH_FS_stopping, &c->flags))
- b = bch2_btree_iter_next_node(&iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_iter_exit(trans, &iter);
- }
-
- bch2_trans_put(trans);
- if (ret)
- return;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
if (!bl)
- goto out;
+ return false;
- nr = blacklist_nr_entries(bl);
+ unsigned nr = blacklist_nr_entries(bl);
dst = bl->start;
- t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
- for (src = bl->start, i = eytzinger0_first(t->nr);
+ unsigned i;
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
- if (t->entries[i].dirty)
+ if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
}
- new_nr = dst - bl->start;
-
- bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- if (new_nr != nr) {
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
+ unsigned new_nr = dst - bl->start;
+ if (new_nr == nr)
+ return false;
- if (!new_nr)
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+ bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
- bch2_write_super(c);
- }
-out:
- mutex_unlock(&c->sb_lock);
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+ return true;
}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index afb886ec8e25..d47636f96fdc 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-void bch2_blacklist_entries_gc(struct work_struct *);
+bool bch2_blacklist_entries_gc(struct bch_fs *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
new file mode 100644
index 000000000000..2566b12dbc04
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist_format.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a0851..1ef3a28ed6ab 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -9,6 +9,9 @@
#include "super_types.h"
#include "fifo.h"
+/* btree write buffer steals 8 bits for its own purposes: */
+#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
+
#define JOURNAL_BUF_BITS 2
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
@@ -18,6 +21,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
+ struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +37,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
- bool noflush; /* write has already been kicked off, and was noflush */
- bool must_flush; /* something wants a flush */
- bool separate_flush;
- bool need_flush_to_write_buffer;
+ bool noflush:1; /* write has already been kicked off, and was noflush */
+ bool must_flush:1; /* something wants a flush */
+ bool separate_flush:1;
+ bool need_flush_to_write_buffer:1;
+ bool write_started:1;
+ bool write_allocated:1;
+ bool write_done:1;
+ u8 idx;
};
/*
@@ -45,15 +53,18 @@ struct journal_buf {
*/
enum journal_pin_type {
- JOURNAL_PIN_btree,
- JOURNAL_PIN_key_cache,
- JOURNAL_PIN_other,
- JOURNAL_PIN_NR,
+ JOURNAL_PIN_TYPE_btree3,
+ JOURNAL_PIN_TYPE_btree2,
+ JOURNAL_PIN_TYPE_btree1,
+ JOURNAL_PIN_TYPE_btree0,
+ JOURNAL_PIN_TYPE_key_cache,
+ JOURNAL_PIN_TYPE_other,
+ JOURNAL_PIN_TYPE_NR,
};
struct journal_entry_pin_list {
- struct list_head list[JOURNAL_PIN_NR];
- struct list_head flushed;
+ struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
+ struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
};
@@ -107,6 +118,7 @@ union journal_res_state {
*/
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
@@ -124,16 +136,23 @@ enum journal_space_from {
journal_space_nr,
};
+#define JOURNAL_FLAGS() \
+ x(replay_done) \
+ x(running) \
+ x(may_skip_flush) \
+ x(need_flush_write) \
+ x(space_low)
+
enum journal_flags {
- JOURNAL_REPLAY_DONE,
- JOURNAL_STARTED,
- JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NEED_FLUSH_WRITE,
+#define x(n) JOURNAL_##n,
+ JOURNAL_FLAGS()
+#undef x
};
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
+ x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@@ -149,6 +168,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
+struct journal_bio {
+ struct bch_dev *ca;
+ unsigned buf_idx;
+
+ struct bio bio;
+};
+
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@@ -174,6 +200,7 @@ struct journal {
* insufficient devices:
*/
enum journal_errors cur_entry_error;
+ unsigned cur_entry_offset_if_blocked;
unsigned buf_size_want;
/*
@@ -202,9 +229,10 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
+ struct closure_waitlist reclaim_flush_wait;
- struct closure io;
struct delayed_work write_work;
+ struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@@ -212,9 +240,11 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
+ u64 flushing_seq;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
+ u64 oldest_seq_found_ondisk;
/*
* FIFO of journal entries whose btree updates have not yet been
@@ -274,11 +304,6 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
- u64 low_on_space_start;
- u64 low_on_pin_start;
- u64 max_in_flight_start;
- u64 write_buffer_full_start;
-
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
@@ -313,10 +338,11 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
- struct bio *bio;
+ struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;
+ u64 highest_seq_found;
};
/*
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad598105c587..75f27ec26f85 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -34,30 +34,40 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
- struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (!fn)
- return 0;
+ fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
+ trans, logged_op_but_clean,
+ "filesystem marked as clean but have logged op\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf));
+ struct bkey_buf sk;
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
+ if (fn)
+ fn->resume(trans, sk.k);
+
+ ret = bch2_logged_op_finish(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_logged_ops, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ for_each_btree_key_max(trans, iter,
+ BTREE_ID_logged_ops,
+ POS(LOGGED_OPS_INUM_logged_ops, 0),
+ POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
+ BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
@@ -66,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c)
static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
struct btree_iter iter;
- int ret;
-
- ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+ int ret = bch2_bkey_get_empty_slot(trans, &iter,
+ BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
if (ret)
return ret;
@@ -85,7 +94,7 @@ int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
__bch2_logged_op_start(trans, k));
}
-void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
@@ -101,8 +110,10 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
- __func__, buf.buf, bch2_err_str(ret));
+ bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+ buf.buf, bch2_err_str(ret));
printbuf_exit(&buf);
}
+
+ return ret;
}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
index 4d1e786a27a8..30ae9ef737dd 100644
--- a/fs/bcachefs/logged_ops.h
+++ b/fs/bcachefs/logged_ops.h
@@ -15,6 +15,6 @@ static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i
int bch2_resume_logged_ops(struct bch_fs *);
int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
index 6a4bf7129dba..cfb67c95d4c8 100644
--- a/fs/bcachefs/logged_ops_format.h
+++ b/fs/bcachefs/logged_ops_format.h
@@ -2,6 +2,11 @@
#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+enum logged_ops_inums {
+ LOGGED_OPS_INUM_logged_ops,
+ LOGGED_OPS_INUM_inode_cursors,
+};
+
struct bch_logged_op_truncate {
struct bch_val v;
__le32 subvol;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 7a4ca5a28b3e..ce794d55818f 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
+#include "bkey_buf.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -10,14 +11,13 @@
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
- lru_entry_at_time_0,
+ bkey_fsck_err_on(!lru_pos_time(k.k->p),
+ c, lru_entry_at_time_0,
"lru entry at time=0");
fsck_err:
return ret;
@@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set)
{
return time
- ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
: 0;
}
@@ -77,10 +77,49 @@ static const char * const bch2_lru_types[] = {
NULL
};
+int bch2_lru_check_set(struct btree_trans *trans,
+ u16 lru_id, u64 time,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k =
+ bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(lru_id,
+ bucket_to_u64(referring_k.k->p),
+ time), 0);
+ int ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (lru_k.k->type != KEY_TYPE_set) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, alloc_key_to_missing_lru_entry,
+ "missing %s lru entry\n"
+ " %s",
+ bch2_lru_types[lru_type(lru_k)],
+ (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+ ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
- struct bpos *last_flushed_pos)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -94,11 +133,13 @@ static int bch2_check_lru_key(struct btree_trans *trans,
u64 idx;
int ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
- lru_entry_to_invalid_bucket,
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
+
+ if (fsck_err_on(!ca,
+ trans, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
- return bch2_btree_delete_at(trans, lru_iter, 0);
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
ret = bkey_err(k);
@@ -112,21 +153,17 @@ static int bch2_check_lru_key(struct btree_trans *trans,
idx = alloc_lru_idx_read(*a);
break;
case BCH_LRU_fragmentation:
- idx = a->fragmentation_lru;
+ idx = alloc_lru_idx_fragmentation(*a, ca);
break;
}
if (lru_k.k->type != KEY_TYPE_set ||
lru_pos_time(lru_k.k->p) != idx) {
- if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
- *last_flushed_pos = lru_k.k->p;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
+ if (ret)
+ goto err;
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, lru_entry_bad,
+ if (fsck_err(trans, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
@@ -134,12 +171,12 @@ static int bch2_check_lru_key(struct btree_trans *trans,
lru_pos_time(lru_k.k->p),
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
- ret = bch2_btree_delete_at(trans, lru_iter, 0);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
}
-out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
@@ -147,12 +184,18 @@ fsck_err:
int bch2_check_lrus(struct bch_fs *c)
{
- struct bpos last_flushed_pos = POS_MIN;
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
- bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_lru_key(trans, &iter, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 429dca816df5..f31a6cf1514c 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -2,9 +2,6 @@
#ifndef _BCACHEFS_LRU_H
#define _BCACHEFS_LRU_H
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
static inline u64 lru_pos_id(struct bpos pos)
{
return pos.inode >> LRU_TIME_BITS;
@@ -27,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
return pos;
}
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
-
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
@@ -48,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
return BCH_LRU_read;
}
-int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
#define bch2_bkey_ops_lru ((struct bkey_ops) { \
- .key_invalid = bch2_lru_invalid, \
+ .key_validate = bch2_lru_validate, \
.val_to_text = bch2_lru_to_text, \
.min_val_size = 8, \
})
@@ -64,6 +48,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+struct bkey_buf;
+int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+
int bch2_check_lrus(struct bch_fs *);
#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
new file mode 100644
index 000000000000..f372cb3b8cda
--- /dev/null
+++ b/fs/bcachefs/lru_format.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_FORMAT_H
+#define _BCACHEFS_LRU_FORMAT_H
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index bf0ef668fd38..0ea9f30803a2 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
* @s: mean and variance number of samples and their sums
* @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 x, bool initted, u8 weight)
{
// previous weighted variance.
- u8 w = s->weight;
+ u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
- if (!s->init) {
+ if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
- s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return fast_divpow2(s.mean, s.weight);
+ return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight)
{
// always positive don't need fast divpow2
- return s.variance >> s.weight;
+ return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 64df11ab422b..47e4a3c3d26e 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -111,11 +111,11 @@ static inline u128_u u128_shl(u128_u i, s8 shift)
{
u128_u r;
- r.lo = i.lo << shift;
+ r.lo = i.lo << (shift & 63);
if (shift < 64)
- r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+ r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
else {
- r.hi = i.lo << (shift - 64);
+ r.hi = i.lo << (-shift & 63);
r.lo = 0;
}
return r;
@@ -154,8 +154,6 @@ struct mean_and_variance {
/* expontentially weighted variant */
struct mean_and_variance_weighted {
- bool init;
- u8 weight; /* base 2 logarithim */
s64 mean;
u64 variance;
};
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 v, bool initted, u8 weight);
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight);
#endif // MEAN_AND_VAIRANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 019583c3ca0e..e9d9c0212e44 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
static void mean_and_variance_weighted_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 2 };
+ struct mean_and_variance_weighted s = { };
- mean_and_variance_weighted_update(&s, 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, 10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, 20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, 20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, 30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, 30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
- s = (struct mean_and_variance_weighted) { .weight = 2 };
+ s = (struct mean_and_variance_weighted) { };
- mean_and_variance_weighted_update(&s, -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, -10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, -20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, -20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, -30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, -30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
}
static void mean_and_variance_weighted_advanced_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 8 };
+ struct mean_and_variance_weighted s = { };
+ bool initted = false;
s64 i;
- for (i = 10; i <= 100; i += 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = 10; i <= 100; i += 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
- s = (struct mean_and_variance_weighted) { .weight = 8 };
+ s = (struct mean_and_variance_weighted) { };
+ initted = false;
- for (i = -10; i >= -100; i -= 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = -10; i >= -100; i -= 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
}
static void do_mean_and_variance_test(struct kunit *test,
@@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
s64 *weighted_stddev)
{
struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { .weight = weight };
+ struct mean_and_variance_weighted vw = { };
for (unsigned i = 0; i < initial_n; i++) {
mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value, false, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
}
for (unsigned i = 0; i < n; i++) {
mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i], true, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
}
KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
@@ -130,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
- s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 };
@@ -155,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_4(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 10, 11, 12, 13, 14 };
- s64 stddev[] = { 9, 13, 15, 17, 19 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
static void mean_and_variance_fast_divpow2(struct kunit *test)
{
s64 i;
@@ -224,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2),
- KUNIT_CASE(mean_and_variance_test_3),
- KUNIT_CASE(mean_and_variance_test_4),
{}
};
@@ -237,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = {
kunit_test_suite(mean_and_variance_test_suite);
MODULE_AUTHOR("Daniel B. Hill");
+MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5623cee3ef86..ddc187fb693d 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -EINVAL;
+ return -BCH_ERR_remove_would_lose_data;
return 0;
}
@@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
if (!bch2_bkey_has_device_c(k, dev_idx))
return 0;
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
@@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
@@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
continue;
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -EINVAL;
+ return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
for (id = 0; id < BTREE_ID_NR; id++) {
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
@@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
- if (ret) {
- bch_err(c, "Cannot drop device without losing data");
+ if (ret)
break;
- }
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index bf68ea49447b..160b4374160a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -21,6 +21,8 @@
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
+#include "rebalance.h"
+#include "reflink.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
@@ -36,36 +38,6 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:");
- prt_tab(out);
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str(out, "kill ptrs: ");
- prt_tab(out);
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str(out, "target: ");
- prt_tab(out);
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str(out, "compression: ");
- prt_tab(out);
- bch2_compression_opt_to_text(out, background_compression(*io_opts));
- prt_newline(out);
-
- prt_str(out, "extra replicas: ");
- prt_tab(out);
- prt_u64(out, data_opts->extra_replicas);
-}
-
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
@@ -226,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+ /*
+ * Generally, releasing a transaction within a transaction restart means
+ * an unhandled transaction restart: but this can happen legitimately
+ * within the move code, e.g. when bch2_move_ratelimit() tells us to
+ * exit before we've retried
+ */
+ bch2_trans_begin(ctxt->trans);
bch2_trans_put(ctxt->trans);
memset(ctxt, 0, sizeof(*ctxt));
}
@@ -296,7 +275,7 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas) {
if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
}
@@ -322,8 +301,8 @@ int bch2_move_extent(struct moving_context *ctxt,
io->write_sectors = k.k->size;
bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- bio_set_prio(&io->write.op.wbio.bio,
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->write.op.wbio.bio.bi_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
GFP_KERNEL))
@@ -333,7 +312,7 @@ int bch2_move_extent(struct moving_context *ctxt,
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
io->rbio.bio.bi_vcnt = pages;
- bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
io->rbio.bio.bi_iter.bi_size = sectors << 9;
io->rbio.bio.bi_opf = REQ_OP_READ;
@@ -409,34 +388,42 @@ err:
return ret;
}
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct per_snapshot_io_opts *io_opts,
+ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+ struct btree_iter *extent_iter,
struct bkey_s_c extent_k)
{
struct bch_fs *c = trans->c;
u32 restart_count = trans->restart_count;
+ struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
int ret = 0;
- if (io_opts->cur_inum != extent_k.k->p.inode) {
+ if (extent_k.k->type == KEY_TYPE_reflink_v)
+ goto out;
+
+ if (io_opts->cur_inum != extent_pos.inode) {
io_opts->d.nr = 0;
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- if (k.k->p.offset != extent_k.k->p.inode)
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
+ BTREE_ITER_all_snapshots, k, ({
+ if (k.k->p.offset != extent_pos.inode)
break;
if (!bkey_is_inode(k.k))
continue;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(k, &inode));
+ _ret3 = bch2_inode_unpack(k, &inode);
+ if (_ret3)
+ break;
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
darray_push(&io_opts->d, e);
}));
- io_opts->cur_inum = extent_k.k->p.inode;
+ io_opts->cur_inum = extent_pos.inode;
}
ret = ret ?: trans_was_restarted(trans, restart_count);
@@ -445,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
if (extent_k.k->p.snapshot)
darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
- return &i->io_opts;
-
- return &io_opts->fs_io_opts;
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
+ opts_ret = &i->io_opts;
+ break;
+ }
+out:
+ ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
+ if (ret)
+ return ERR_PTR(ret);
+ return opts_ret;
}
int bch2_move_get_io_opts_one(struct btree_trans *trans,
struct bch_io_opts *io_opts,
+ struct btree_iter *extent_iter,
struct bkey_s_c extent_k)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ struct bch_fs *c = trans->c;
+
+ *io_opts = bch2_opts_to_inode_opts(c->opts);
/* reflink btree? */
- if (!extent_k.k->p.inode) {
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
- return 0;
- }
+ if (!extent_k.k->p.inode)
+ goto out;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter inode_iter;
+ struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_CACHED);
- ret = bkey_err(k);
+ BTREE_ITER_cached);
+ int ret = bkey_err(inode_k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- if (!ret && bkey_is_inode(k.k)) {
+ if (!ret && bkey_is_inode(inode_k.k)) {
struct bch_inode_unpacked inode;
- bch2_inode_unpack(k, &inode);
- bch2_inode_opts_get(io_opts, trans->c, &inode);
- } else {
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ bch2_inode_unpack(inode_k, &inode);
+ bch2_inode_opts_get(io_opts, c, &inode);
}
-
- bch2_trans_iter_exit(trans, &iter);
- return 0;
+ bch2_trans_iter_exit(trans, &inode_iter);
+out:
+ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}
int bch2_move_ratelimit(struct moving_context *ctxt)
@@ -539,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_iter iter;
+ struct btree_iter iter, reflink_iter = {};
struct bkey_s_c k;
struct data_update_opts data_opts;
+ /*
+ * If we're moving a single file, also process reflinked data it points
+ * to (this includes propagating changed io_opts from the inode to the
+ * extent):
+ */
+ bool walk_indirect = start.inode == end.inode;
int ret = 0, ret2;
per_snapshot_io_opts_init(&snapshot_io_opts, c);
@@ -552,14 +548,17 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+ bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
while (!bch2_move_ratelimit(ctxt)) {
+ struct btree_iter *extent_iter = &iter;
+
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -578,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+ if (walk_indirect &&
+ k.k->type == KEY_TYPE_reflink_p &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+
+ bch2_trans_iter_exit(trans, &reflink_iter);
+ k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_deleted(k.k))
+ goto next_nondata;
+
+ /*
+ * XXX: reflink pointers may point to multiple indirect
+ * extents, so don't advance past the entire reflink
+ * pointer - need to fixup iter->k
+ */
+ extent_iter = &reflink_iter;
+ }
+
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
+ iter.pos, extent_iter, k);
ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
@@ -597,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
@@ -618,6 +643,7 @@ next_nondata:
bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &reflink_iter);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
per_snapshot_io_opts_exit(&snapshot_io_opts);
@@ -683,20 +709,22 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
bool is_kthread = current->flags & PF_KTHREAD;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter;
+ struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
- struct bch_backpointer bp;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
struct bkey_s_c k;
struct data_update_opts data_opts;
- unsigned dirty_sectors, bucket_size;
- u64 fragmentation;
- struct bpos bp_pos = POS_MIN;
+ unsigned sectors_moved = 0;
+ struct bkey_buf last_flushed;
int ret = 0;
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ if (!ca)
+ return 0;
+
trace_bucket_evacuate(c, &bucket);
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
bch2_bkey_buf_init(&sk);
/*
@@ -704,21 +732,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
*/
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED);
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket), 0);
bch_err_msg(c, ret, "looking up alloc key");
if (ret)
goto err;
- a = bch2_alloc_to_v4(k, &a_convert);
- dirty_sectors = bch2_bucket_sectors_dirty(*a);
- bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
- fragmentation = a->fragmentation_lru;
-
ret = bch2_btree_write_buffer_tryflush(trans);
bch_err_msg(c, ret, "flushing btree write buffer");
if (ret)
@@ -730,18 +750,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
- ret = bch2_get_next_backpointer(trans, bucket, gen,
- &bp_pos, &bp,
- BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek(&bp_iter);
+ ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
goto err;
- if (bkey_eq(bp_pos, POS_MAX))
+
+ if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
break;
- if (!bp.level) {
- k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ if (k.k->type != KEY_TYPE_backpointer)
+ goto next;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ if (!bp.v->level) {
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -753,7 +778,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@@ -763,14 +788,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
data_opts.target = io_opts.background_target;
data_opts.rewrite_ptrs = 0;
+ unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
unsigned i = 0;
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == bucket.inode) {
- data_opts.rewrite_ptrs |= 1U << i;
- if (ptr->cached) {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (p.ptr.dev == bucket.inode) {
+ if (p.ptr.cached) {
bch2_trans_iter_exit(trans, &iter);
goto next;
}
+ data_opts.rewrite_ptrs |= 1U << i;
+ break;
}
i++;
}
@@ -790,14 +819,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->stats)
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ sectors_moved += sectors;
} else {
struct btree *b;
- b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+ b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
ret = PTR_ERR_OR_ZERO(b);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- continue;
+ goto next;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
@@ -805,7 +835,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
- unsigned sectors = btree_ptr_sectors_written(&b->key);
+ unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -821,14 +851,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
atomic64_add(sectors, &ctxt->stats->sectors_seen);
atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
+ sectors_moved += btree_sectors(c);
}
next:
- bp_pos = bpos_nosnap_successor(bp_pos);
+ bch2_btree_iter_advance(&bp_iter);
}
- trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+ trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
+ bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
@@ -868,7 +902,7 @@ static int bch2_move_btree(struct bch_fs *c,
continue;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
@@ -920,7 +954,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
@@ -968,27 +1015,17 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg,
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
+/*
+ * Ancient versions of bcachefs produced packed formats which could represent
+ * keys that the in memory format cannot represent; this checks for those
+ * formats so we can get rid of them.
+ */
static bool bformat_needs_redo(struct bkey_format *f)
{
- unsigned i;
-
- for (i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f->bits_per_field[i] > unpacked_bits)
- return true;
-
- if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ for (unsigned i = 0; i < f->nr_fields; i++)
+ if (bch2_bkey_format_field_overflows(f, i))
return true;
- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
- unpacked_mask) <
- field_offset)
- return true;
- }
-
return false;
}
@@ -1043,6 +1080,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1053,6 +1091,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
+ rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
@@ -1137,23 +1176,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "keys moved: ");
- prt_u64(out, atomic64_read(&stats->keys_moved));
- prt_newline(out);
-
- prt_str(out, "keys raced: ");
- prt_u64(out, atomic64_read(&stats->keys_raced));
- prt_newline(out);
-
- prt_str(out, "bytes seen: ");
+ prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_str(out, "bytes moved: ");
+ prt_printf(out, "bytes moved: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_str(out, "bytes raced: ");
+ prt_printf(out, "bytes raced: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1167,19 +1200,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
- prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->read_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->read_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
- prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->write_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->write_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 9baf3093a678..51e0505a8156 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt
darray_exit(&io_opts->d);
}
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
- struct per_snapshot_io_opts *, struct bkey_s_c);
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
+ struct btree_iter *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a84dad4..6718dc37c5a3 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -35,9 +35,10 @@ struct buckets_in_flight {
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
- .key_len = sizeof(struct move_bucket_key),
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
};
static struct move_bucket_in_flight *
@@ -72,31 +73,36 @@ move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a;
- int ret;
+ struct bch_fs *c = trans->c;
- if (bch2_bucket_is_open(trans->c,
- b->k.bucket.inode,
- b->k.bucket.offset))
+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
return 0;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_CACHED);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (ret)
return ret;
- a = bch2_alloc_to_v4(k, &_a);
+ struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+ if (!ca)
+ goto out;
+
+ if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !bch2_dev_is_online(ca))
+ goto out_put;
+
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- ret = data_type_movable(a->data_type) &&
- a->fragmentation_lru &&
- a->fragmentation_lru <= time;
-
+ ret = lru_idx && lru_idx <= time;
+out_put:
+ bch2_dev_put(ca);
+out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -155,11 +161,12 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_err_matches(ret, EROFS))
return ret;
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
- __func__, bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
+ bch2_trans_begin(trans);
+
+ ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
@@ -207,7 +214,8 @@ static int bch2_copygc(struct moving_context *ctxt,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
- u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
+ u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
@@ -237,7 +245,6 @@ static int bch2_copygc(struct moving_context *ctxt,
*did_work = true;
}
err:
- darray_exit(&buckets);
/* no entries in LRU btree found, or got to end: */
if (bch2_err_matches(ret, ENOENT))
@@ -246,8 +253,11 @@ err:
if (ret < 0 && !bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "from bch2_move_data()");
- moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
- trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
+ sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
+ trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
+
+ darray_exit(&buckets);
return ret;
}
@@ -288,18 +298,23 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "Currently waiting for: ");
+ printbuf_tabstop_push(out, 32);
+ prt_printf(out, "running:\t%u\n", c->copygc_running);
+ prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
+ prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
+
+ prt_printf(out, "Currently waiting for:\t");
prt_human_readable_u64(out, max(0LL, c->copygc_wait -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
prt_newline(out);
- prt_printf(out, "Currently waiting since: ");
+ prt_printf(out, "Currently waiting since:\t");
prt_human_readable_u64(out, max(0LL,
atomic64_read(&c->io_clock[WRITE].now) -
c->copygc_wait_at) << 9);
prt_newline(out);
- prt_printf(out, "Currently calculated wait: ");
+ prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
}
@@ -337,9 +352,9 @@ static int bch2_copygc_thread(void *arg)
bch2_trans_unlock_long(ctxt.trans);
cond_resched();
- if (!c->copy_gc_enabled) {
+ if (!c->opts.copygc_enabled) {
move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->copy_gc_enabled ||
+ kthread_wait_freezable(c->opts.copygc_enabled ||
kthread_should_stop());
}
@@ -376,7 +391,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
- bch2_trans_unlock_long(ctxt.trans);
+ move_buckets_wait(&ctxt, buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index b1ed0b9a20d3..6772faf385a5 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -1,12 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/fs_parser.h>
#include "bcachefs.h"
#include "compress.h"
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -42,12 +44,12 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
-const char * const bch2_csum_opts[] = {
+const char * const __bch2_csum_opts[] = {
BCH_CSUM_OPTS()
NULL
};
@@ -62,7 +64,7 @@ const char * const bch2_compression_opts[] = {
NULL
};
-const char * const bch2_str_hash_types[] = {
+const char * const __bch2_str_hash_types[] = {
BCH_STR_HASH_TYPES()
NULL
};
@@ -82,18 +84,41 @@ const char * const bch2_member_states[] = {
NULL
};
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
BCH_FS_USAGE_TYPES()
NULL
};
#undef x
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
+
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
{
@@ -203,7 +228,12 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
.min = _min, .max = _max
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = ARRAY_SIZE(_choices), \
+ .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
+ .choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
+#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
.choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
@@ -305,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c,
switch (opt->type) {
case BCH_OPT_BOOL:
if (val) {
- ret = kstrtou64(val, 10, res);
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
+ if (ret != -BCH_ERR_option_not_bool) {
+ *res = ret;
+ } else {
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret;
+ }
} else {
- ret = 0;
*res = 1;
}
- if (ret < 0 || (*res != 0 && *res != 1)) {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
break;
case BCH_OPT_UINT:
if (!val) {
@@ -351,8 +382,19 @@ int bch2_opt_parse(struct bch_fs *c,
*res = ret;
break;
+ case BCH_OPT_BITFIELD: {
+ s64 v = bch2_read_flag_list(val, opt->choices);
+ if (v < 0)
+ return v;
+ *res = v;
+ break;
+ }
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);
+
+ if (ret == -BCH_ERR_option_needs_open_fs)
+ return ret;
+
if (ret < 0) {
if (err)
prt_printf(err, "%s: parse error",
@@ -389,11 +431,16 @@ void bch2_opt_to_text(struct printbuf *out,
prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
- if (flags & OPT_SHOW_FULL_LIST)
+ if (v < opt->min || v >= opt->max)
+ prt_printf(out, "(invalid option %lli)", v);
+ else if (flags & OPT_SHOW_FULL_LIST)
prt_string_option(out, opt->choices, v);
else
prt_str(out, opt->choices[v]);
break;
+ case BCH_OPT_BITFIELD:
+ prt_bitflags(out, opt->choices, v);
+ break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@@ -402,6 +449,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}
+void bch2_opts_to_text(struct printbuf *out,
+ struct bch_opts opts,
+ struct bch_fs *c, struct bch_sb *sb,
+ unsigned show_mask, unsigned hide_mask,
+ unsigned flags)
+{
+ bool first = true;
+
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+
+ if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
+ continue;
+
+ u64 v = bch2_opt_get_by_id(&opts, i);
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+
+ bch2_opt_to_text(out, c, sb, opt, v, flags);
+ }
+}
+
int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;
@@ -435,14 +508,81 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}
+int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
+ struct printbuf *parse_later,
+ const char *name, const char *val)
+{
+ struct printbuf err = PRINTBUF;
+ u64 v;
+ int ret, id;
+
+ id = bch2_mount_opt_lookup(name);
+
+ /* Check for the form "noopt", negation of a boolean opt: */
+ if (id < 0 &&
+ !val &&
+ !strncmp("no", name, 2)) {
+ id = bch2_mount_opt_lookup(name + 2);
+ val = "0";
+ }
+
+ /* Unknown options are ignored: */
+ if (id < 0)
+ return 0;
+
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ goto bad_opt;
+
+ if (id == Opt_acl &&
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ goto bad_opt;
+
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
+
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
+ prt_printf(parse_later, "%s=%s,", name, val);
+ if (parse_later->allocation_failure) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0)
+ goto bad_val;
+
+ if (opts)
+ bch2_opt_set_by_id(opts, id, v);
+
+ ret = 0;
+ goto out;
+
+bad_opt:
+ pr_err("Bad mount option %s", name);
+ ret = -BCH_ERR_option_name;
+ goto out;
+
+bad_val:
+ pr_err("Invalid mount option %s", err.buf);
+ ret = -BCH_ERR_option_value;
+
+out:
+ printbuf_exit(&err);
+ return ret;
+}
+
int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- char *options)
+ struct printbuf *parse_later, char *options)
{
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
- int ret, id;
- struct printbuf err = PRINTBUF;
- u64 v;
+ int ret;
if (!options)
return 0;
@@ -456,60 +596,26 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
- return -1;
+ return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
- name = strsep(&opt, "=");
- val = opt;
-
- id = bch2_mount_opt_lookup(name);
-
- /* Check for the form "noopt", negation of a boolean opt: */
- if (id < 0 &&
- !val &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- val = "0";
- }
-
- /* Unknown options are ignored: */
- if (id < 0)
+ if (!*opt)
continue;
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- goto bad_opt;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
-
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
+ name = strsep(&opt, "=");
+ val = opt;
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
if (ret < 0)
- goto bad_val;
-
- bch2_opt_set_by_id(opts, id, v);
+ goto out;
}
ret = 0;
goto out;
-bad_opt:
- pr_err("Bad mount option %s", name);
- ret = -1;
- goto out;
-bad_val:
- pr_err("Invalid mount option %s", err.buf);
- ret = -1;
- goto out;
out:
kfree(copied_opts_start);
- printbuf_exit(&err);
return ret;
}
@@ -549,10 +655,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
return 0;
}
-void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+struct bch_dev_sb_opt_set {
+ void (*set_sb)(struct bch_member *, u64);
+};
+
+static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
+#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
+ BCH_DEV_OPT_SETTERS()
+#undef x
+};
+
+void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
+ enum bch_opt_id id = opt - bch2_opt_table;
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -560,16 +676,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = ilog2(v);
- opt->set_sb(sb, v);
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ v++;
+
+ if (opt->flags & OPT_FS) {
+ if (opt->set_sb != SET_BCH2_NO_SB_OPT)
+ opt->set_sb(sb, v);
+ }
+
+ if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return;
+
+ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
+
+ const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
+ if (set->set_sb)
+ set->set_sb(m, v);
+ else
+ pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ }
}
-void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
-
mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
@@ -578,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
- return (struct bch_io_opts) {
+ struct bch_io_opts opts = {
#define x(_name, _bits) ._name = src._name,
BCH_INODE_OPTS()
#undef x
};
+
+ bch2_io_opts_fixups(&opts);
+ return opts;
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9a4b7faa3765..9d397fc2a1f0 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,23 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
-extern const char * const bch2_csum_opts[];
+extern const char * const __bch2_csum_opts[];
extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
-extern const char * const bch2_str_hash_types[];
+extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
+
static inline const char *bch2_d_type_str(unsigned d_type)
{
return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
@@ -51,22 +56,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
- OPT_FS = (1 << 0), /* Filesystem option */
- OPT_DEVICE = (1 << 1), /* Device option */
- OPT_INODE = (1 << 2), /* Inode option */
- OPT_FORMAT = (1 << 3), /* May be specified at format time */
- OPT_MOUNT = (1 << 4), /* May be specified at mount time */
- OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = (1 << 6),
- OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
+ OPT_FS = BIT(0), /* Filesystem option */
+ OPT_DEVICE = BIT(1), /* Device option */
+ OPT_INODE = BIT(2), /* Inode option */
+ OPT_FORMAT = BIT(3), /* May be specified at format time */
+ OPT_MOUNT = BIT(4), /* May be specified at mount time */
+ OPT_RUNTIME = BIT(5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = BIT(6),
+ OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
+ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
+ OPT_HIDDEN = BIT(11),
};
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
+ BCH_OPT_BITFIELD,
BCH_OPT_FN,
};
@@ -135,7 +143,7 @@ enum fsck_err_opts {
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
@@ -165,12 +173,12 @@ enum fsck_err_opts {
"size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_csum_opts), \
+ OPT_STR(__bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_csum_opts), \
+ OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(compression, u8, \
@@ -214,14 +222,14 @@ enum fsck_err_opts {
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, true, \
+ x(shard_inode_numbers_bits, u8, \
+ OPT_FS|OPT_FORMAT, \
+ OPT_UINT(0, 8), \
+ BCH_SB_SHARD_INUMS_NBITS, 0, \
NULL, "Shard new inode numbers by CPU id") \
x(inodes_use_key_cache, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
@@ -260,6 +268,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
+ x(promote_whole_extents, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
+ NULL, "Promote whole extents, instead of just part being read")\
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
@@ -290,6 +303,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
+ x(no_splitbrain_check, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
@@ -332,6 +350,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
+ x(fsck_memory_usage_percent, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(20, 70), \
+ BCH2_NO_SB_OPT, 50, \
+ NULL, "Maximum percentage of system ram fsck is allowed to pin")\
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FN(bch2_opt_fix_errors), \
@@ -352,12 +375,27 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
- x(keep_journal, u8, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_passes, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to run explicitly") \
+ x(recovery_passes_exclude, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to exclude") \
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys after startup")\
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
@@ -373,6 +411,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
NULL, "Log transaction function names in journal") \
+ x(allocator_stuck_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
+ NULL, "Default timeout in seconds for stuck allocator messages")\
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
@@ -389,7 +432,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@@ -409,11 +452,6 @@ enum fsck_err_opts {
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
- x(buckets_nouse, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allocate the buckets_nouse bitmap") \
x(stdio, u64, \
0, \
OPT_UINT(0, S64_MAX), \
@@ -437,6 +475,18 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, true, \
NULL, "Enable nocow mode: enables runtime locking in\n"\
"data move path needed if nocow will ever be in use\n")\
+ x(copygc_enabled, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable copygc: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
+ x(rebalance_enabled, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable rebalance: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
x(no_data_io, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
@@ -452,20 +502,30 @@ enum fsck_err_opts {
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
BCH2_NO_SB_OPT, 0, \
- "size", "Size of filesystem on device") \
+ "size", "Specifies the bucket size; must be greater than the btree node size")\
x(durability, u8, \
- OPT_DEVICE, \
+ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
+ x(data_allowed, u8, \
+ OPT_DEVICE, \
+ OPT_BITFIELD(__bch2_data_types), \
+ BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ "types", "Allowed data types for this device: journal, btree, and/or user")\
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\
+ NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
+#define BCH_DEV_OPT_SETTERS() \
+ x(discard, BCH_MEMBER_DISCARD) \
+ x(durability, BCH_MEMBER_DURABILITY) \
+ x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
+
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -476,6 +536,13 @@ struct bch_opts {
#undef x
};
+struct bch2_opts_parse {
+ struct bch_opts opts;
+
+ /* to save opts that can't be parsed before the FS is opened: */
+ struct printbuf parse_later;
+};
+
static const __maybe_unused struct bch_opts bch2_opts_default = {
#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
._name##_defined = true, \
@@ -538,8 +605,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
-void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+
+struct bch_dev;
+void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
@@ -551,10 +620,17 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
+void bch2_opts_to_text(struct printbuf *,
+ struct bch_opts,
+ struct bch_fs *, struct bch_sb *,
+ unsigned, unsigned, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
+int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
+ struct printbuf *, const char *, const char *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
+ char *);
/* inode opts: */
@@ -562,11 +638,22 @@ struct bch_io_opts {
#define x(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef x
+#define x(_name, _bits) u64 _name##_from_inode:1;
+ BCH_INODE_OPTS()
+#undef x
};
-static inline unsigned background_compression(struct bch_io_opts opts)
+static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
{
- return opts.background_compression ?: opts.compression;
+ if (!opts->background_target)
+ opts->background_target = opts->foreground_target;
+ if (!opts->background_compression)
+ opts->background_compression = opts->compression;
+ if (opts->nocow) {
+ opts->compression = opts->background_compression = 0;
+ opts->data_checksum = 0;
+ opts->erasure_code = 0;
+ }
}
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index b27d22925929..4cf5a2af1e6f 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -10,35 +10,57 @@
#include "printbuf.h"
-static inline unsigned printbuf_linelen(struct printbuf *buf)
+static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
{
- return buf->pos - buf->last_newline;
+ return pos - buf->last_newline;
}
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+static inline unsigned printbuf_linelen(struct printbuf *buf)
{
- unsigned new_size;
- char *buf;
+ return __printbuf_linelen(buf, buf->pos);
+}
- if (!out->heap_allocated)
- return 0;
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
/* Reserved space for terminating nul: */
extra += 1;
- if (out->pos + extra < out->size)
+ if (out->pos + extra <= out->size)
+ return 0;
+
+ if (!out->heap_allocated) {
+ out->overflow = true;
return 0;
+ }
+
+ unsigned new_size = roundup_pow_of_two(out->size + extra);
- new_size = roundup_pow_of_two(out->size + extra);
+ /* Sanity check... */
+ if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
+ out->allocation_failure = true;
+ out->overflow = true;
+ return -ENOMEM;
+ }
/*
* Note: output buffer must be freeable with kfree(), it's not required
* that the user use printbuf_exit().
*/
- buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+ char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
if (!buf) {
out->allocation_failure = true;
+ out->overflow = true;
return -ENOMEM;
}
@@ -47,6 +69,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
return 0;
}
+static void printbuf_advance_pos(struct printbuf *out, unsigned len)
+{
+ out->pos += min(len, printbuf_remaining(out));
+}
+
+static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
+{
+ unsigned move = out->pos - pos;
+
+ bch2_printbuf_make_room(out, nr);
+
+ if (pos + nr < out->size)
+ memmove(out->buf + pos + nr,
+ out->buf + pos,
+ min(move, out->size - 1 - pos - nr));
+
+ if (pos < out->size)
+ memset(out->buf + pos, ' ', min(nr, out->size - pos));
+
+ printbuf_advance_pos(out, nr);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ while (true) {
+ int pad;
+ unsigned len = out->pos - pos;
+ char *p = out->buf + pos;
+ char *n = memscan(p, '\n', len);
+ if (cur_tabstop(out)) {
+ n = min(n, (char *) memscan(p, '\r', len));
+ n = min(n, (char *) memscan(p, '\t', len));
+ }
+
+ pos = n - out->buf;
+ if (pos == out->pos)
+ break;
+
+ switch (*n) {
+ case '\n':
+ pos++;
+ out->last_newline = pos;
+
+ printbuf_insert_spaces(out, pos, out->indent);
+
+ pos = min(pos + out->indent, out->pos);
+ out->last_field = pos;
+ out->cur_tabstop = 0;
+ break;
+ case '\r':
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
+ if (pad > 0) {
+ printbuf_insert_spaces(out, out->last_field, pad);
+ pos += pad;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ case '\t':
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
+ if (pad > 0) {
+ *n = ' ';
+ printbuf_insert_spaces(out, pos, pad - 1);
+ pos += pad;
+ } else {
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ }
+ }
+}
+
+static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
+ __printbuf_do_indent(out, pos);
+}
+
void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
{
int len;
@@ -55,14 +163,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_list args2;
va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
va_end(args2);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
@@ -72,14 +180,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
do {
va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
va_end(args);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -194,31 +302,32 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
void bch2_prt_newline(struct printbuf *buf)
{
- unsigned i;
-
bch2_printbuf_make_room(buf, 1 + buf->indent);
- __prt_char(buf, '\n');
+ __prt_char_reserved(buf, '\n');
buf->last_newline = buf->pos;
- for (i = 0; i < buf->indent; i++)
- __prt_char(buf, ' ');
+ __prt_chars_reserved(buf, ' ', buf->indent);
- printbuf_nul_terminate(buf);
+ printbuf_nul_terminate_reserved(buf);
buf->last_field = buf->pos;
buf->cur_tabstop = 0;
}
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
+void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
+ for (int p = out->pos - 1; p >= 0; --p) {
+ if (out->buf[p] == '\n') {
+ out->pos = p;
+ break;
+ }
+ if (out->buf[p] != ' ')
+ break;
+ }
+
+ printbuf_nul_terminate_reserved(out);
}
static void __prt_tab(struct printbuf *out)
@@ -247,24 +356,9 @@ void bch2_prt_tab(struct printbuf *out)
static void __prt_tab_rjust(struct printbuf *buf)
{
- unsigned move = buf->pos - buf->last_field;
int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
-
- if (pad > 0) {
- bch2_printbuf_make_room(buf, pad);
-
- if (buf->last_field + pad < buf->size)
- memmove(buf->buf + buf->last_field + pad,
- buf->buf + buf->last_field,
- min(move, buf->size - 1 - buf->last_field - pad));
-
- if (buf->last_field < buf->size)
- memset(buf->buf + buf->last_field, ' ',
- min((unsigned) pad, buf->size - buf->last_field));
-
- buf->pos += pad;
- printbuf_nul_terminate(buf);
- }
+ if (pad > 0)
+ printbuf_insert_spaces(buf, buf->last_field, pad);
buf->last_field = buf->pos;
buf->cur_tabstop++;
@@ -301,41 +395,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
*/
void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
{
- const char *unprinted_start = str;
- const char *end = str + count;
-
- if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
- prt_bytes(out, str, count);
- return;
- }
-
- while (str != end) {
- switch (*str) {
- case '\n':
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- bch2_prt_newline(out);
- break;
- case '\t':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab(out);
- }
- break;
- case '\r':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab_rjust(out);
- }
- break;
- }
-
- str++;
- }
-
- prt_bytes(out, unprinted_start, str - unprinted_start);
+ unsigned indent_pos = out->pos;
+ prt_bytes(out, str, count);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -348,9 +410,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
{
bch2_printbuf_make_room(out, 10);
- out->pos += string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
+ unsigned len = string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+ printbuf_advance_pos(out, len);
}
/**
@@ -402,9 +465,7 @@ void bch2_prt_string_option(struct printbuf *out,
const char * const list[],
size_t selected)
{
- size_t i;
-
- for (i = 0; list[i]; i++)
+ for (size_t i = 0; list[i]; i++)
bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 9a4a56c40937..d0dd398baa2b 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -86,6 +86,7 @@ struct printbuf {
u8 atomic;
bool allocation_failure:1;
bool heap_allocated:1;
+ bool overflow:1;
enum printbuf_si si_units:1;
bool human_readable_units:1;
bool has_indent_or_tabstops:1;
@@ -114,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned);
void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
void bch2_prt_newline(struct printbuf *);
+void bch2_printbuf_strip_trailing_newline(struct printbuf *);
void bch2_prt_tab(struct printbuf *);
void bch2_prt_tab_rjust(struct printbuf *);
@@ -142,7 +144,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
*/
static inline unsigned printbuf_remaining_size(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos : 0;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ return out->size - out->pos;
}
/*
@@ -151,7 +155,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out)
*/
static inline unsigned printbuf_remaining(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos - 1 : 0;
+ return out->size ? printbuf_remaining_size(out) - 1 : 0;
}
static inline unsigned printbuf_written(struct printbuf *out)
@@ -159,30 +163,25 @@ static inline unsigned printbuf_written(struct printbuf *out)
return out->size ? min(out->pos, out->size - 1) : 0;
}
-/*
- * Returns true if output was truncated:
- */
-static inline bool printbuf_overflowed(struct printbuf *out)
+static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
{
- return out->pos >= out->size;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ if (out->size)
+ out->buf[out->pos] = 0;
}
static inline void printbuf_nul_terminate(struct printbuf *out)
{
bch2_printbuf_make_room(out, 1);
-
- if (out->pos < out->size)
- out->buf[out->pos] = 0;
- else if (out->size)
- out->buf[out->size - 1] = 0;
+ printbuf_nul_terminate_reserved(out);
}
/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
static inline void __prt_char_reserved(struct printbuf *out, char c)
{
if (printbuf_remaining(out))
- out->buf[out->pos] = c;
- out->pos++;
+ out->buf[out->pos++] = c;
}
/* Doesn't nul terminate: */
@@ -194,37 +193,34 @@ static inline void __prt_char(struct printbuf *out, char c)
static inline void prt_char(struct printbuf *out, char c)
{
- __prt_char(out, c);
- printbuf_nul_terminate(out);
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, c);
+ printbuf_nul_terminate_reserved(out);
}
static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
{
- unsigned i, can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = c;
- out->pos += n - can_print;
}
static inline void prt_chars(struct printbuf *out, char c, unsigned n)
{
bch2_printbuf_make_room(out, n);
__prt_chars_reserved(out, c, n);
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
{
- unsigned i, can_print;
-
bch2_printbuf_make_room(out, n);
- can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = ((char *) b)[i];
- out->pos += n - can_print;
printbuf_nul_terminate(out);
}
@@ -241,18 +237,28 @@ static inline void prt_str_indented(struct printbuf *out, const char *str)
static inline void prt_hex_byte(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_hi(byte));
__prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_upper_hi(byte));
__prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->last_newline = 0;
+ buf->last_field = 0;
+ buf->indent = 0;
+ buf->cur_tabstop = 0;
}
/**
@@ -260,11 +266,8 @@ static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
*/
static inline void printbuf_reset(struct printbuf *buf)
{
- buf->pos = 0;
- buf->allocation_failure = 0;
- buf->indent = 0;
+ printbuf_reset_keep_tabstops(buf);
buf->nr_tabstops = 0;
- buf->cur_tabstop = 0;
}
/**
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index e68b34eab90a..8b857fc33244 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = {
};
static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
@@ -59,14 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.to_text = bch2_sb_quota_to_text,
};
-int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
- quota_type_invalid,
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
+ c, quota_type_invalid,
"invalid quota type (%llu >= %u)",
k.k->p.inode, QTYP_NR);
fsck_err:
@@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "i_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", i->i_fieldmask);
- prt_newline(out);
-
- prt_str(out, "i_flags");
- prt_tab(out);
- prt_printf(out, "%u", i->i_flags);
- prt_newline(out);
-
- prt_str(out, "i_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_warnlimit);
- prt_newline(out);
+ prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
+ prt_printf(out, "i_flags\t%u\n", i->i_flags);
+ prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
+ prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
+ prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
+ prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
+ prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
+ prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
}
static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
@@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "d_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", q->d_fieldmask);
- prt_newline(out);
-
- prt_str(out, "d_spc_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_spc_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_space");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_space);
- prt_newline(out);
-
- prt_str(out, "d_ino_count");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_count);
- prt_newline(out);
-
- prt_str(out, "d_ino_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_timer);
- prt_newline(out);
-
- prt_str(out, "d_spc_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_timer);
- prt_newline(out);
-
- prt_str(out, "d_ino_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_ino_warns);
- prt_newline(out);
-
- prt_str(out, "d_spc_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_spc_warns);
- prt_newline(out);
+ prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
+ prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
+ prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
+ prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit);
+ prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
+ prt_printf(out, "d_space\t%llu\n", q->d_space);
+ prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
+ prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
+ prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
+ prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
+ prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
}
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
@@ -560,13 +485,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_snapshot_tree s_t;
- int ret;
+ u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__,
- snapshot_t(c, k.k->p.snapshot)->tree);
+ "%s: snapshot tree %u not found", __func__, tree);
if (ret)
return ret;
@@ -612,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
__bch2_quota_set(c, k, NULL)) ?:
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
bch2_fs_quota_read_inode(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
@@ -902,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
ret = bkey_err(k);
if (unlikely(ret))
return ret;
@@ -946,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 884f601f41c4..1551800ff44c 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -5,15 +5,14 @@
#include "inode.h"
#include "quota_types.h"
-enum bkey_invalid_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota ((struct bkey_ops) { \
- .key_invalid = bch2_quota_invalid, \
+ .key_validate = bch2_quota_validate, \
.val_to_text = bch2_quota_to_text, \
.min_val_size = 32, \
})
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
new file mode 100644
index 000000000000..bef2aa1b8bcd
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.c
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+
+#include <linux/generic-radix-tree.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/vmalloc.h>
+
+#include "rcu_pending.h"
+#include "darray.h"
+#include "util.h"
+
+#define static_array_for_each(_a, _i) \
+ for (typeof(&(_a)[0]) _i = _a; \
+ _i < (_a) + ARRAY_SIZE(_a); \
+ _i++)
+
+enum rcu_pending_special {
+ RCU_PENDING_KVFREE = 1,
+ RCU_PENDING_CALL_RCU = 2,
+};
+
+#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
+#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
+
+#ifdef __KERNEL__
+typedef unsigned long rcu_gp_poll_state_t;
+
+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
+{
+ return l == r;
+}
+#else
+typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
+
+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
+{
+ return l.grace_period_id == r.grace_period_id;
+}
+#endif
+
+static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? get_state_synchronize_srcu(ssp)
+ : get_state_synchronize_rcu();
+}
+
+static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? start_poll_synchronize_srcu(ssp)
+ : start_poll_synchronize_rcu();
+}
+
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
+{
+ return ssp
+ ? poll_state_synchronize_srcu(ssp, cookie)
+ : poll_state_synchronize_rcu(cookie);
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+ return ssp
+ ? srcu_barrier(ssp)
+ : rcu_barrier();
+}
+
+static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ if (ssp)
+ call_srcu(ssp, rhp, func);
+ else
+ call_rcu(rhp, func);
+}
+
+struct rcu_pending_seq {
+ /*
+ * We're using a radix tree like a vector - we're just pushing elements
+ * onto the end; we're using a radix tree instead of an actual vector to
+ * avoid reallocation overhead
+ */
+ GENRADIX(struct rcu_head *) objs;
+ size_t nr;
+ struct rcu_head **cursor;
+ rcu_gp_poll_state_t seq;
+};
+
+struct rcu_pending_list {
+ struct rcu_head *head;
+ struct rcu_head *tail;
+ rcu_gp_poll_state_t seq;
+};
+
+struct rcu_pending_pcpu {
+ struct rcu_pending *parent;
+ spinlock_t lock;
+ int cpu;
+
+ /*
+ * We can't bound the number of unprocessed gp sequence numbers, and we
+ * can't efficiently merge radix trees for expired grace periods, so we
+ * need darray/vector:
+ */
+ DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
+
+ /* Third entry is for expired objects: */
+ struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
+
+ struct rcu_head cb;
+ bool cb_armed;
+ struct work_struct work;
+};
+
+static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
+{
+ if (p->objs.nr)
+ return true;
+
+ static_array_for_each(p->lists, i)
+ if (i->head)
+ return true;
+
+ return false;
+}
+
+static void rcu_pending_list_merge(struct rcu_pending_list *l1,
+ struct rcu_pending_list *l2)
+{
+#ifdef __KERNEL__
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next = l2->head;
+#else
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next.next = (void *) l2->head;
+#endif
+
+ l1->tail = l2->tail;
+ l2->head = l2->tail = NULL;
+}
+
+static void rcu_pending_list_add(struct rcu_pending_list *l,
+ struct rcu_head *n)
+{
+#ifdef __KERNEL__
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next = n;
+ l->tail = n;
+ n->next = NULL;
+#else
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next.next = (void *) n;
+ l->tail = n;
+ n->next.next = NULL;
+#endif
+}
+
+static void merge_expired_lists(struct rcu_pending_pcpu *p)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ for (struct rcu_pending_list *i = p->lists; i < expired; i++)
+ if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
+ rcu_pending_list_merge(expired, i);
+}
+
+#ifndef __KERNEL__
+static inline void kfree_bulk(size_t nr, void ** p)
+{
+ while (nr--)
+ kfree(*p);
+}
+
+#define local_irq_save(flags) \
+do { \
+ flags = 0; \
+} while (0)
+#endif
+
+static noinline void __process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ struct rcu_pending_seq objs = {};
+ struct rcu_head *list = NULL;
+
+ if (p->objs.nr &&
+ __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
+ objs = p->objs.data[0];
+ darray_remove_item(&p->objs, p->objs.data);
+ }
+
+ merge_expired_lists(p);
+
+ list = expired->head;
+ expired->head = expired->tail = NULL;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ for (size_t i = 0; i < objs.nr; ) {
+ size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
+
+ kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
+ i += nr_this_node;
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+
+ /*
+ * low bit of pointer indicates whether rcu_head needs
+ * to be freed - kvfree_rcu_mightsleep()
+ */
+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
+
+ void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
+ bool free_head = ((unsigned long) obj->func) & 1UL;
+
+ kvfree(ptr);
+ if (free_head)
+ kfree(obj);
+ }
+
+ break;
+
+ case RCU_PENDING_CALL_RCU:
+ for (size_t i = 0; i < objs.nr; i++) {
+ struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
+ obj->func(obj);
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ obj->func(obj);
+ }
+ break;
+
+ default:
+ for (size_t i = 0; i < objs.nr; i++)
+ pending->process(pending, *genradix_ptr(&objs.objs, i));
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ pending->process(pending, obj);
+ }
+ break;
+ }
+}
+
+static bool process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ /*
+ * XXX: we should grab the gp seq once and avoid multiple function
+ * calls, this is called from __rcu_pending_enqueue() fastpath in
+ * may_sleep==true mode
+ */
+ if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
+ (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
+ (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
+ p->lists[2].head) {
+ __process_finished_items(pending, p, flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void rcu_pending_work(struct work_struct *work)
+{
+ struct rcu_pending_pcpu *p =
+ container_of(work, struct rcu_pending_pcpu, work);
+ struct rcu_pending *pending = p->parent;
+ unsigned long flags;
+
+ do {
+ spin_lock_irqsave(&p->lock, flags);
+ } while (process_finished_items(pending, p, flags));
+
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void rcu_pending_rcu_cb(struct rcu_head *rcu)
+{
+ struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
+
+ schedule_work_on(p->cpu, &p->work);
+
+ unsigned long flags;
+ spin_lock_irqsave(&p->lock, flags);
+ if (__rcu_pending_has_pending(p)) {
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ p->cb_armed = false;
+ spin_unlock_irqrestore(&p->lock, flags);
+ }
+}
+
+static __always_inline struct rcu_pending_seq *
+get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
+{
+ darray_for_each_reverse(p->objs, objs)
+ if (rcu_gp_poll_cookie_eq(objs->seq, seq))
+ return objs;
+
+ if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
+ return NULL;
+
+ return &darray_last(p->objs);
+}
+
+static noinline bool
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
+ struct rcu_head *head, void *ptr,
+ unsigned long *flags)
+{
+ if (ptr) {
+ if (!head) {
+ /*
+ * kvfree_rcu_mightsleep(): we weren't passed an
+ * rcu_head, but we need one: use the low bit of the
+ * ponter to free to flag that the head needs to be
+ * freed as well:
+ */
+ ptr = (void *)(((unsigned long) ptr)|1UL);
+ head = kmalloc(sizeof(*head), __GFP_NOWARN);
+ if (!head) {
+ spin_unlock_irqrestore(&p->lock, *flags);
+ head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
+ /*
+ * dropped lock, did GFP_KERNEL allocation,
+ * check for gp expiration
+ */
+ if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
+ kvfree(--ptr);
+ kfree(head);
+ spin_lock_irqsave(&p->lock, *flags);
+ return false;
+ }
+ }
+ }
+
+ head->func = ptr;
+ }
+again:
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
+ rcu_pending_list_add(i, head);
+ return false;
+ }
+ }
+
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (!i->head) {
+ i->seq = seq;
+ rcu_pending_list_add(i, head);
+ return true;
+ }
+ }
+
+ merge_expired_lists(p);
+ goto again;
+}
+
+/*
+ * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
+ * pending->pracess) once grace period elapses.
+ *
+ * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
+ * back to a linked list.
+ *
+ * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
+ * process callback
+ *
+ * - If @ptr and @head are both not NULL, we're kvfree_rcu()
+ *
+ * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
+ *
+ * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
+ * expired items.
+ */
+static __always_inline void
+__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
+ void *ptr, bool may_sleep)
+{
+
+ struct rcu_pending_pcpu *p;
+ struct rcu_pending_seq *objs;
+ struct genradix_node *new_node = NULL;
+ unsigned long flags;
+ bool start_gp = false;
+
+ BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
+restart:
+ if (may_sleep &&
+ unlikely(process_finished_items(pending, p, flags)))
+ goto check_expired;
+
+ /*
+ * In kvfree_rcu() mode, the radix tree is only for slab pointers so
+ * that we can do kfree_bulk() - vmalloc pointers always use the linked
+ * list:
+ */
+ if (ptr && unlikely(is_vmalloc_addr(ptr)))
+ goto list_add;
+
+ objs = get_object_radix(p, seq);
+ if (unlikely(!objs))
+ goto list_add;
+
+ if (unlikely(!objs->cursor)) {
+ /*
+ * New radix tree nodes must be added under @p->lock because the
+ * tree root is in a darray that can be resized (typically,
+ * genradix supports concurrent unlocked allocation of new
+ * nodes) - hence preallocation and the retry loop:
+ */
+ objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
+ objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
+ if (unlikely(!objs->cursor)) {
+ if (may_sleep) {
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ gfp_t gfp = GFP_KERNEL;
+ if (!head)
+ gfp |= __GFP_NOFAIL;
+
+ new_node = genradix_alloc_node(gfp);
+ if (!new_node)
+ may_sleep = false;
+ goto check_expired;
+ }
+list_add:
+ start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
+ goto start_gp;
+ }
+ }
+
+ *objs->cursor++ = ptr ?: head;
+ /* zero cursor if we hit the end of a radix tree node: */
+ if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
+ objs->cursor = NULL;
+ start_gp = !objs->nr;
+ objs->nr++;
+start_gp:
+ if (unlikely(start_gp)) {
+ /*
+ * We only have one callback (ideally, we would have one for
+ * every outstanding graceperiod) - so if our callback is
+ * already in flight, we may still have to start a grace period
+ * (since we used get_state() above, not start_poll())
+ */
+ if (!p->cb_armed) {
+ p->cb_armed = true;
+ __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ __start_poll_synchronize_rcu(pending->srcu);
+ }
+ }
+ spin_unlock_irqrestore(&p->lock, flags);
+free_node:
+ if (new_node)
+ genradix_free_node(new_node);
+ return;
+check_expired:
+ if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ kvfree(ptr);
+ break;
+ case RCU_PENDING_CALL_RCU:
+ head->func(head);
+ break;
+ default:
+ pending->process(pending, head);
+ break;
+ }
+ goto free_node;
+ }
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ goto restart;
+}
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
+{
+ __rcu_pending_enqueue(pending, obj, NULL, true);
+}
+
+static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
+{
+ struct rcu_head *ret = NULL;
+
+ spin_lock_irq(&p->lock);
+ darray_for_each(p->objs, objs)
+ if (objs->nr) {
+ ret = *genradix_ptr(&objs->objs, --objs->nr);
+ objs->cursor = NULL;
+ if (!objs->nr)
+ genradix_free(&objs->objs);
+ goto out;
+ }
+
+ static_array_for_each(p->lists, i)
+ if (i->head) {
+ ret = i->head;
+#ifdef __KERNEL__
+ i->head = ret->next;
+#else
+ i->head = (void *) ret->next.next;
+#endif
+ if (!i->head)
+ i->tail = NULL;
+ goto out;
+ }
+out:
+ spin_unlock_irq(&p->lock);
+
+ return ret;
+}
+
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
+{
+ return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
+{
+ struct rcu_head *ret = rcu_pending_dequeue(pending);
+
+ if (ret)
+ return ret;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ spin_lock_irq(&p->lock);
+ if (__rcu_pending_has_pending(p) || p->cb_armed) {
+ spin_unlock_irq(&p->lock);
+ return true;
+ }
+ spin_unlock_irq(&p->lock);
+ }
+
+ return false;
+}
+
+void rcu_pending_exit(struct rcu_pending *pending)
+{
+ int cpu;
+
+ if (!pending->p)
+ return;
+
+ while (rcu_pending_has_pending_or_armed(pending)) {
+ __rcu_barrier(pending->srcu);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+ static_array_for_each(p->lists, i)
+ WARN_ON(i->head);
+ WARN_ON(p->objs.nr);
+ darray_exit(&p->objs);
+ }
+ free_percpu(pending->p);
+}
+
+/**
+ * rcu_pending_init: - initialize a rcu_pending
+ *
+ * @pending: Object to init
+ * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
+ * RCU flavor
+ * @process: Callback function invoked on objects once their RCU barriers
+ * have completed; if NULL, kvfree() is used.
+ */
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process)
+{
+ pending->p = alloc_percpu(struct rcu_pending_pcpu);
+ if (!pending->p)
+ return -ENOMEM;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ p->parent = pending;
+ p->cpu = cpu;
+ spin_lock_init(&p->lock);
+ darray_init(&p->objs);
+ INIT_WORK(&p->work, rcu_pending_work);
+ }
+
+ pending->srcu = srcu;
+ pending->process = process;
+
+ return 0;
+}
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
new file mode 100644
index 000000000000..71a2f4ddaade
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RCU_PENDING_H
+#define _LINUX_RCU_PENDING_H
+
+#include <linux/rcupdate.h>
+
+struct rcu_pending;
+typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
+
+struct rcu_pending_pcpu;
+
+struct rcu_pending {
+ struct rcu_pending_pcpu __percpu *p;
+ struct srcu_struct *srcu;
+ rcu_pending_process_fn process;
+};
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
+
+void rcu_pending_exit(struct rcu_pending *pending);
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process);
+
+#endif /* _LINUX_RCU_PENDING_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 22d1017aa49b..d0a1f5cd5c2b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
+#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@@ -23,6 +24,190 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
+/* bch_extent_rebalance: */
+
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k,
+ struct bkey_ptrs_c ptrs)
+{
+ if (!opts->background_compression)
+ return 0;
+
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten)
+ return 0;
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ return rewrite_ptrs;
+}
+
+static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_ptrs_c ptrs)
+{
+ if (!opts->background_target ||
+ !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
+ return 0;
+
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ return rewrite_ptrs;
+}
+
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
+ bch2_bkey_ptrs_need_move(c, opts, ptrs);
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ if (!opts)
+ return 0;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 sectors = 0;
+
+ if (opts->background_compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ sectors = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ sectors += p.crc.compressed_size;
+ }
+ }
+incompressible:
+ if (opts->background_target)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
+ sectors += p.crc.compressed_size;
+
+ return sectors;
+}
+
+static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_s_c k)
+{
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
+ return old == NULL || memcmp(old, &new, sizeof(new));
+ } else {
+ return old != NULL;
+ }
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_i *_k)
+{
+ if (!bkey_extent_is_direct_data(&_k->k))
+ return 0;
+
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *old =
+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
+ if (!old) {
+ old = bkey_val_end(k);
+ k.k->u64s += sizeof(*old) / sizeof(u64);
+ }
+
+ *old = io_opts_to_rebalance_opts(c, opts);
+ } else {
+ if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
+ }
+
+ return 0;
+}
+
+int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ BUG_ON(iter->flags & BTREE_ITER_is_extents);
+ BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
+
+ const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
+ ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (r) {
+#define x(_name) \
+ if (r->_name##_from_inode) { \
+ io_opts->_name = r->_name; \
+ io_opts->_name##_from_inode = true; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+ }
+
+ if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
+ return 0;
+
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
+
+ /* On successfull transaction commit, @k was invalidated: */
+
+ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
static const char * const bch2_rebalance_state_strs[] = {
@@ -32,7 +217,7 @@ static const char * const bch2_rebalance_state_strs[] = {
#undef x
};
-static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -42,7 +227,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -69,8 +254,9 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
- __bch2_set_rebalance_needs_scan(trans, inum));
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_set_rebalance_needs_scan_trans(trans, inum));
rebalance_wakeup(c);
return ret;
}
@@ -89,7 +275,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -118,6 +304,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
+ if (!bch2_bkey_rebalance_opts(k))
+ return 0;
+
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
int ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -131,31 +320,28 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct bpos work_pos,
struct btree_iter *extent_iter,
+ struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
bch2_trans_iter_exit(trans, extent_iter);
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek_slot(extent_iter);
+ BTREE_ITER_all_snapshots);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
- const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
- if (!r) {
- /* raced due to btree write buffer, nothing to do */
- return bkey_s_c_null;
- }
+ int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
memset(data_opts, 0, sizeof(*data_opts));
-
- data_opts->rewrite_ptrs =
- bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
- data_opts->target = r->target;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
/*
@@ -174,12 +360,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (trace_rebalance_extent_enabled()) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "target=");
- bch2_target_to_text(&buf, c, r->target);
- prt_str(&buf, " compression=");
- bch2_compression_opt_to_text(&buf, r->compression);
- prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
+ if (p) {
+ prt_str(&buf, "compression=");
+ bch2_compression_opt_to_text(&buf, io_opts->background_compression);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
+
+ p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
+ if (p) {
+ prt_str(&buf, "move=");
+ bch2_target_to_text(&buf, c, io_opts->background_target);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
trace_rebalance_extent(c, buf.buf);
printbuf_exit(&buf);
@@ -208,14 +410,10 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_init(&sk);
ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &data_opts));
+ extent_iter, &io_opts, &data_opts));
if (ret || !k.k)
goto out;
- ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
- if (ret)
- goto out;
-
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
/*
@@ -249,20 +447,9 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- unsigned target, compression;
-
- if (k.k->p.inode) {
- target = io_opts->background_target;
- compression = background_compression(*io_opts);
- } else {
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
- target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression : background_compression(*io_opts);
- }
-
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
- data_opts->target = target;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}
@@ -323,17 +510,19 @@ static int do_rebalance(struct moving_context *ctxt)
struct bkey_s_c k;
int ret = 0;
+ bch2_trans_begin(trans);
+
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
bch2_trans_iter_init(trans, &rebalance_work_iter,
BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
- if (!r->enabled) {
+ if (!c->opts.rebalance_enabled) {
bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(r->enabled ||
+ kthread_wait_freezable(c->opts.rebalance_enabled ||
kthread_should_stop());
}
@@ -412,11 +601,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 28a52638f16c..62a3859d3823 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -2,8 +2,38 @@
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
+#include "compress.h"
+#include "disk_groups.h"
+#include "opts.h"
#include "rebalance_types.h"
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
+ struct bch_io_opts *opts)
+{
+ struct bch_extent_rebalance r = {
+ .type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name) \
+ ._name = opts->_name, \
+ ._name##_from_inode = opts->_name##_from_inode,
+ BCH_REBALANCE_OPTS()
+#undef x
+ };
+
+ if (r.background_target &&
+ !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
+ r.background_target = 0;
+
+ return r;
+};
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
+int bch2_get_update_rebalance_opts(struct btree_trans *,
+ struct bch_io_opts *,
+ struct btree_iter *,
+ struct bkey_s_c);
+
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
new file mode 100644
index 000000000000..ff9a1342a22b
--- /dev/null
+++ b/fs/bcachefs/rebalance_format.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_FORMAT_H
+#define _BCACHEFS_REBALANCE_FORMAT_H
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:3,
+
+ promote_target_from_inode:1,
+ erasure_code_from_inode:1,
+ data_checksum_from_inode:1,
+ background_compression_from_inode:1,
+ data_replicas_from_inode:1,
+ background_target_from_inode:1,
+
+ promote_target:16,
+ erasure_code:1,
+ data_checksum:4,
+ data_replicas:4,
+ background_compression:8, /* enum bch_compression_opt */
+ background_target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 background_target:16,
+ background_compression:8,
+ data_replicas:4,
+ data_checksum:4,
+ erasure_code:1,
+ promote_target:16,
+
+ background_target_from_inode:1,
+ data_replicas_from_inode:1,
+ background_compression_from_inode:1,
+ data_checksum_from_inode:1,
+ erasure_code_from_inode:1,
+ promote_target_from_inode:1,
+
+ unused:3,
+ type:6;
+#endif
+};
+
+/* subset of BCH_INODE_OPTS */
+#define BCH_REBALANCE_OPTS() \
+ x(data_checksum) \
+ x(background_compression) \
+ x(data_replicas) \
+ x(promote_target) \
+ x(background_target) \
+ x(erasure_code)
+
+#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
+
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 0fffb536c1d0..fe5098c17dfc 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -30,8 +30,6 @@ struct bch_fs_rebalance {
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
-
- unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 21e13bb4335b..71c786cdb192 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,66 +1,170 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
#include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
-#include "ec.h"
+#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
-#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
-#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
-#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
#include <linux/stat.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static bool btree_id_is_alloc(enum btree_id id)
+int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- switch (id) {
+ u64 b = BIT_ULL(btree);
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_to_text(&buf, btree);
+ bch_err(c, "flagging btree %s lost data", buf.buf);
+ printbuf_exit(&buf);
+ ext->btrees_lost_data |= cpu_to_le64(b);
+ }
+
+ /* Once we have runtime self healing for topology errors we won't need this: */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
+
+ /* Btree node accounting will be off: */
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * These are much more minor, and don't need to be corrected right away,
+ * but in debug mode we want the next fsck run to be clean:
+ */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
+#endif
+
+ switch (btree) {
case BTREE_ID_alloc:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ goto out;
case BTREE_ID_backpointers:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
+ goto out;
case BTREE_ID_need_discard:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
case BTREE_ID_freespace:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
case BTREE_ID_bucket_gens:
- return true;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_lru:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_accounting:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ goto out;
default:
- return false;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
+ goto out;
}
+out:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+static void kill_btree(struct bch_fs *c, enum btree_id btree)
+{
+ bch2_btree_id_root(c, btree)->alive = false;
+ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
- size_t src, dst;
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->d[src].btree_id))
- keys->d[dst++] = keys->d[src];
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- keys->nr = dst;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
+ if (btree_id_is_alloc(i))
+ kill_btree(c, i);
}
/*
@@ -70,9 +174,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
@@ -89,14 +191,53 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
+static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ struct bkey u;
+ struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
+
+ /* Has this delta already been applied to the btree? */
+ if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
+ ret = 0;
+ goto out;
+ }
+
+ struct bkey_i *new = k->k;
+ if (old.k->type == KEY_TYPE_accounting) {
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(new),
+ bkey_s_c_to_accounting(old));
+ }
+
+ trans->journal_res.seq = k->journal_seq;
+
+ ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
struct btree_iter iter;
unsigned iter_flags =
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS;
- unsigned update_flags = BTREE_TRIGGER_NORUN;
+ BTREE_ITER_intent|
+ BTREE_ITER_not_extents;
+ unsigned update_flags = BTREE_TRIGGER_norun;
int ret;
if (k->overwritten)
@@ -105,17 +246,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
trans->journal_res.seq = k->journal_seq;
/*
- * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+ * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
* besides the allocator is doing updates yet so we don't need key cache
* coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+ * btrees use BTREE_ITER_filter_snapshots, which isn't available until
* the snapshots recovery pass runs.
*/
if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_CACHED;
+ iter_flags |= BTREE_ITER_cached;
else
- update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+ update_flags |= BTREE_UPDATE_key_cache_reclaim;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
@@ -124,10 +265,26 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ if (unlikely(!btree_path_node(path, k->level))) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, 0, iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_increase_depth(trans, iter.path, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
+ if (k->k->k.type == KEY_TYPE_accounting) {
+ ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ goto out;
+ }
+
ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
bch2_trans_iter_exit(trans, &iter);
@@ -139,17 +296,24 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(l->journal_seq, r->journal_seq);
+ /*
+ * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last
+ *
+ * journal_seq == 0 means that the key comes from early repair, and
+ * should be inserted last so as to avoid overflowing the journal
+ */
+ return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_trans *trans = NULL;
+ bool immediate_flush = false;
int ret = 0;
if (keys->nr) {
@@ -161,24 +325,58 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
+ move_gap(keys, keys->nr);
+ trans = bch2_trans_get(c);
+
+ /*
+ * Replay accounting keys first: we can't allow the write buffer to
+ * flush accounting keys until we're done
+ */
+ darray_for_each(*keys, k) {
+ if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
+ continue;
+
+ cond_resched();
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_WATERMARK_reclaim,
+ bch2_journal_replay_accounting_key(trans, k));
+ if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
+ goto err;
+
+ k->overwritten = true;
+ }
+
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
*/
- for (size_t i = 0; i < keys->nr; i++) {
+ darray_for_each(*keys, k) {
cond_resched();
- struct journal_key *k = keys->d + i;
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten);
+ BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
@@ -186,6 +384,7 @@ static int bch2_journal_replay(struct bch_fs *c)
}
}
+ bch2_trans_unlock_long(trans);
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
@@ -199,20 +398,27 @@ static int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- replay_now_at(j, k->journal_seq);
+ if (k->journal_seq)
+ replay_now_at(j, k->journal_seq);
+ else
+ replay_now_at(j, j->replay_journal_seq_end);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
bch2_journal_replay_key(trans, k));
- bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
- bch2_btree_id_str(k->btree_id), k->level);
- if (ret)
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
+ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
+ printbuf_exit(&buf);
goto err;
+ }
- BUG_ON(!k->overwritten);
+ BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}
/*
@@ -222,7 +428,8 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.keep_journal)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -230,6 +437,12 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
@@ -249,7 +462,15 @@ static int journal_replay_entry_early(struct bch_fs *c,
switch (entry->type) {
case BCH_JSET_ENTRY_btree_root: {
- struct btree_root *r;
+
+ if (unlikely(!entry->u64s))
+ return 0;
+
+ if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
+ c, invalid_btree_id,
+ "invalid btree id %u (max %u)",
+ entry->btree_id, BTREE_ID_NR_MAX))
+ return 0;
while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
@@ -257,15 +478,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
return ret;
}
- r = bch2_btree_id_root(c, entry->btree_id);
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
- if (entry->u64s) {
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- } else {
- r->error = -EIO;
- }
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
r->alive = true;
break;
}
@@ -274,42 +491,10 @@ static int journal_replay_entry_early(struct bch_fs *c,
container_of(entry, struct jset_entry_usage, entry);
switch (entry->btree_id) {
- case BCH_FS_USAGE_reserved:
- if (entry->level < BCH_REPLICAS_MAX)
- c->usage_base->persistent_reserved[entry->level] =
- le64_to_cpu(u->v);
- break;
- case BCH_FS_USAGE_inodes:
- c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
- break;
case BCH_FS_USAGE_key_version:
- atomic64_set(&c->key_version,
- le64_to_cpu(u->v));
+ atomic64_set(&c->key_version, le64_to_cpu(u->v));
break;
}
-
- break;
- }
- case BCH_JSET_ENTRY_data_usage: {
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
-
- ret = bch2_replicas_set_usage(c, &u->r,
- le64_to_cpu(u->v));
- break;
- }
- case BCH_JSET_ENTRY_dev_usage: {
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
- ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
- ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
- ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
- }
-
break;
}
case BCH_JSET_ENTRY_blacklist: {
@@ -337,7 +522,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
-
+fsck_err:
return ret;
}
@@ -359,7 +544,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@@ -370,8 +555,6 @@ static int journal_replay_early(struct bch_fs *c,
}
}
- bch2_fs_usage_initialize(c);
-
return 0;
}
@@ -379,199 +562,45 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
- unsigned i;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) &&
- c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- continue;
- }
-
- if (r->error) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- btree_root_bkey_invalid,
- "invalid btree root %s",
- bch2_btree_id_str(i));
- if (i == BTREE_ID_alloc)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- }
-
- ret = bch2_btree_root_read(c, i, &r->key, r->level);
- if (ret) {
- fsck_err(c,
- btree_root_read_error,
- "error reading btree root %s",
- bch2_btree_id_str(i));
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, i, r->level);
+
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ buf.buf) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s: %s",
+ buf.buf, bch2_err_str(ret))) {
if (btree_id_is_alloc(i))
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- ret = 0;
+ r->error = 0;
+
+ ret = bch2_btree_lost_data(c, i);
+ BUG_ON(ret);
}
}
- for (i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (!r->b) {
+ if (!r->b && !r->error) {
r->alive = false;
r->level = 0;
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
fsck_err:
- return ret;
-}
-
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten: it will be accessed by multiple
- * threads
- */
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || c->opts.fsck || !c->sb.clean)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
+ printbuf_exit(&buf);
return ret;
}
@@ -582,6 +611,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
+ bool ret = false;
if (old_version < bcachefs_metadata_required_upgrade_below) {
if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
@@ -637,103 +667,31 @@ static bool check_version_upgrade(struct bch_fs *c)
}
bch_info(c, "%s", buf.buf);
-
- bch2_sb_upgrade(c, new_version);
-
printbuf_exit(&buf);
- return true;
- }
-
- return false;
-}
-
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
-
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
-
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
- }
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+ ret = true;
}
- return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
+ if (new_version > c->sb.version_incompat &&
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
+ struct printbuf buf = PRINTBUF;
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
- if (!(p->when & PASS_ONLINE))
- continue;
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
+ ret = true;
}
+ if (ret)
+ bch2_sb_upgrade(c, new_version,
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
+
return ret;
}
@@ -769,75 +727,73 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
+ if (c->opts.norecovery) {
+ c->opts.recovery_pass_last = c->opts.recovery_pass_last
+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
+ : BCH_RECOVERY_PASS_snapshots_read;
+ c->opts.nochanges = true;
+ c->opts.read_only = true;
}
- if (!c->opts.nochanges) {
- mutex_lock(&c->sb_lock);
- bool write_sb = false;
-
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ bool write_sb = false;
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
- prt_str(&buf, "Version downgrade required:");
-
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
+ prt_str(&buf, "Version downgrade required:");
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, "\n running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- if (check_version_upgrade(c))
- write_sb = true;
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
+
+ if (check_version_upgrade(c))
+ write_sb = true;
- if (write_sb)
- bch2_write_super(c);
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- mutex_unlock(&c->sb_lock);
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
+ write_sb = true;
}
- if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
+ set_bit(BCH_FS_recovery_running, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -845,7 +801,9 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ bch2_journal_pos_from_member_info_resume(c);
+
+ if (!c->sb.clean || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -862,7 +820,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i && !(*i)->ignore) {
+ if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@@ -887,7 +845,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
- (*i)->ignore = false;
+ (*i)->ignore_blacklisted = false;
+ (*i)->ignore_not_dirty= false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@@ -923,17 +882,15 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- drop_alloc_keys(&c->journal_keys);
- }
-
zero_out_btree_mem_ptr(&c->journal_keys);
ret = journal_replay_early(c, clean);
if (ret)
goto err;
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
+
/*
* After an unclean shutdown, skip then next few journal sequence
* numbers as they may have been referenced by btree writes that
@@ -950,7 +907,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
- bch_err(c, "error creating new journal seq blacklist entry");
+ bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@@ -961,9 +918,6 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
- bch2_journal_log_msg(c, "dropping alloc info");
-
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:
@@ -975,11 +929,34 @@ use_clean:
if (ret)
goto err;
+ set_bit(BCH_FS_btree_running, &c->flags);
+
+ ret = bch2_sb_set_upgrade_extra(c);
+
ret = bch2_run_recovery_passes(c);
if (ret)
goto err;
+ /*
+ * Normally set by the appropriate recovery pass: when cleared, this
+ * indicates we're in early recovery and btree updates should be done by
+ * being applied to the journal replay keys. _Must_ be cleared before
+ * multithreaded use:
+ */
+ set_bit(BCH_FS_may_go_rw, &c->flags);
clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_recovery_running, &c->flags);
+
+ /* in case we don't run journal replay, i.e. norecovery mode */
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
+ bch2_async_btree_node_rewrites_flush(c);
+
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
@@ -1015,7 +992,8 @@ use_clean:
}
mutex_lock(&c->sb_lock);
- bool write_sb = false;
+ ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
@@ -1028,15 +1006,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -1047,6 +1028,9 @@ use_clean:
write_sb = true;
}
+ if (bch2_blacklist_entries_gc(c))
+ write_sb = true;
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1069,18 +1053,16 @@ use_clean:
bch_info(c, "scanning for old btree nodes done");
}
- if (c->journal_seq_blacklist_table &&
- c->journal_seq_blacklist_table->nr > 128)
- queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
-
ret = 0;
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!c->opts.retain_recovery_info) {
bch2_journal_keys_put_initial(c);
- kfree(clean);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
+ if (!IS_ERR(clean))
+ kfree(clean);
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
@@ -1102,9 +1084,11 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
+ struct bch_member *m;
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1113,20 +1097,25 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_check_version_downgrade(c);
if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
bch2_write_super(c);
}
+
+ for_each_member_device(c, ca) {
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
+ ca->mi = bch2_mi_to_cpu(m);
+ }
+
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ set_bit(BCH_FS_btree_running, &c->flags);
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
-
- for_each_member_device(c, ca)
- bch2_dev_usage_init(ca);
+ bch2_btree_root_alloc_fake(c, i, 0);
ret = bch2_fs_journal_alloc(c);
if (ret)
@@ -1137,12 +1126,21 @@ int bch2_fs_initialize(struct bch_fs *c)
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal, 1);
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
+ for_each_member_device(c, ca) {
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret) {
+ bch2_dev_put(ca);
+ goto err;
+ }
+ }
+
/*
* Write out the superblock and journal buckets, now that we can do
* btree updates
@@ -1153,9 +1151,6 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(c, ca)
- ca->new_fs_bucket_idx = 0;
-
ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
@@ -1176,14 +1171,14 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "creating root directory");
if (ret)
goto err;
bch2_inode_init_early(c, &lostfound_inode);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_create_trans(trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
@@ -1194,7 +1189,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1214,6 +1209,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b2e..b0d55754b21b 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-extern const char * const bch2_recovery_passes[];
+int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return 0;
-
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 000000000000..0b3c951c32da
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "disk_accounting.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
+static int bch2_recovery_pass_empty(struct bch_fs *c)
+{
+ return 0;
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
+ return -BCH_ERR_not_in_recovery;
+
+ if (c->recovery_passes_complete & BIT_ULL(pass))
+ return 0;
+
+ bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
+
+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
+ c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
+ if (print)
+ bch_info(c, "need recovery pass %s (%u), but already rw",
+ bch2_recovery_passes[pass], pass);
+ return -BCH_ERR_cannot_rewind_recovery;
+ }
+
+ if (print)
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->opts.recovery_passes |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass > pass) {
+ c->next_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&c->recovery_pass_lock, flags);
+ int ret = __bch2_run_explicit_recovery_pass(c, pass);
+ spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
+ return ret;
+}
+
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
+ return false;
+ if (c->opts.recovery_passes & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ down_read(&c->state_lock);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ up_read(&c->state_lock);
+
+ return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ /*
+ * We can't allow set_may_go_rw to be excluded; that would cause us to
+ * use the journal replay keys for updates where it's not expected.
+ */
+ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
+ c->next_recovery_pass = c->curr_recovery_pass + 1;
+
+ spin_lock_irq(&c->recovery_pass_lock);
+ unsigned pass = c->curr_recovery_pass;
+
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last) {
+ spin_unlock_irq(&c->recovery_pass_lock);
+ break;
+ }
+
+ if (!should_run_recovery_pass(c, pass)) {
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ spin_unlock_irq(&c->recovery_pass_lock);
+ continue;
+ }
+ spin_unlock_irq(&c->recovery_pass_lock);
+
+ ret = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
+
+ if (!ret && !test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, pass);
+
+ spin_lock_irq(&c->recovery_pass_lock);
+ if (c->next_recovery_pass < c->curr_recovery_pass) {
+ /*
+ * bch2_run_explicit_recovery_pass() was called: we
+ * can't always catch -BCH_ERR_restart_recovery because
+ * it may have been called from another thread (btree
+ * node read completion)
+ */
+ ret = 0;
+ c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
+ } else {
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ }
+ c->curr_recovery_pass = c->next_recovery_pass;
+ spin_unlock_irq(&c->recovery_pass_lock);
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 000000000000..7d7339c8fa29
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,18 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
new file mode 100644
index 000000000000..418557960ed6
--- /dev/null
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define PASS_FSCK_DEBUG BIT(1)
+#else
+#define PASS_FSCK_DEBUG 0
+#endif
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, PASS_ALWAYS) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_allocations, 5, PASS_FSCK) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0)
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ BCH_RECOVERY_PASS_NR
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
deleted file mode 100644
index fa0c8efd2a1b..000000000000
--- a/fs/bcachefs/recovery_types.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
-
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_topology, 4, 0) \
- x(check_allocations, 5, PASS_FSCK) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0) \
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b394..441e648f28b5 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -15,6 +15,17 @@
#include <linux/sched/signal.h>
+static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_indirect_inline_data:
+ return true;
+ default:
+ return false;
+ }
+}
+
static inline unsigned bkey_type_to_indirect(const struct bkey *k)
{
switch (k->type) {
@@ -29,17 +40,16 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
-int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
int ret = 0;
- bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
- c, err, reflink_p_front_pad_bad,
+ bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
+ c, reflink_p_front_pad_bad,
"idx < front_pad (%llu < %u)",
- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
fsck_err:
return ret;
}
@@ -50,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
prt_printf(out, "idx %llu front_pad %u back_pad %u",
- le64_to_cpu(p.v->idx),
+ REFLINK_P_IDX(p.v),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
}
@@ -66,90 +76,291 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
*/
return false;
- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
+ return false;
+
+ if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
return false;
bch2_key_resize(l.k, l.k->size + r.k->size);
return true;
}
+/* indirect extents */
+
+int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
+ c, reflink_v_pos_bad,
+ "indirect extent above maximum position 0:%llu",
+ REFLINK_P_IDX_MAX);
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return 0;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
+/* lookup */
+
+static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
+ bool should_commit)
+{
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ SET_REFLINK_P_ERROR(&new->v, false);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ return ret;
+
+ if (!should_commit)
+ return 0;
+
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
+static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 missing_start, u64 missing_end,
+ bool should_commit)
+{
+ if (REFLINK_P_ERROR(p.v))
+ return 0;
+
+ struct bch_fs *c = trans->c;
+ u64 live_start = REFLINK_P_IDX(p.v);
+ u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
+ u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
+ u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(missing_start < refd_start);
+ BUG_ON(missing_end > refd_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ missing_start, missing_end)) {
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ /*
+ * Is the missing range not actually needed?
+ *
+ * p.v->idx refers to the data that we actually want, but if the
+ * indirect extent we point to was bigger, front_pad and back_pad
+ * indicate the range we took a reference on.
+ */
+
+ if (missing_end <= live_start) {
+ new->v.front_pad = cpu_to_le32(live_start - missing_end);
+ } else if (missing_start >= live_end) {
+ new->v.back_pad = cpu_to_le32(missing_start - live_end);
+ } else {
+ struct bpos new_start = bkey_start_pos(&new->k);
+ struct bpos new_end = new->k.p;
+
+ if (missing_start > live_start)
+ new_start.offset += missing_start - live_start;
+ if (missing_end < live_end)
+ new_end.offset -= live_end - missing_end;
+
+ bch2_cut_front(new_start, &new->k_i);
+ bch2_cut_back(new_end, &new->k_i);
+
+ SET_REFLINK_P_ERROR(&new->v, true);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ goto err;
+
+ if (should_commit)
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * This is used from the read path, which doesn't expect to have to do a
+ * transaction commit, and from triggers, which should not be doing a commit:
+ */
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ s64 *offset_into_extent,
+ struct bkey_s_c_reflink_p p,
+ bool should_commit,
+ unsigned iter_flags)
+{
+ BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
+ BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
+
+ u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
+
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), iter_flags);
+ if (bkey_err(k))
+ return k;
+
+ if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
+ unsigned size = min((u64) k.k->size,
+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
+ reflink_offset);
+ bch2_key_resize(&iter->k, size);
+
+ int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
+ k.k->p.offset, should_commit);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return bkey_s_c_err(ret);
+ }
+ } else if (unlikely(REFLINK_P_ERROR(p.v))) {
+ int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return bkey_s_c_err(ret);
+ }
+ }
+
+ *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
+ return k;
+}
+
+/* reflink pointer trigger */
+
static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
struct printbuf buf = PRINTBUF;
- int ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
- ret = PTR_ERR_OR_ZERO(k);
+ s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
if (ret)
- goto err;
+ return ret;
- refcount = bkey_refcount(bkey_i_to_s(k));
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ if (!bkey_refcount_c(k)) {
+ if (!(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_missing_indirect_extent;
+ goto next;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
goto err;
+
+ __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
+ if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ log_fsck_err(trans, reflink_refcount_underflow,
+ "indirect extent refcount underflow while marking\n %s",
+ buf.buf);
+ goto next;
}
- if (flags & BTREE_TRIGGER_INSERT) {
+ if (flags & BTREE_TRIGGER_insert) {
struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
u64 pad;
pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
BUG_ON(pad > U32_MAX);
v->front_pad = cpu_to_le32(pad);
pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
BUG_ON(pad > U32_MAX);
v->back_pad = cpu_to_le32(pad);
}
- le64_add_cpu(refcount, add);
+ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
+ ret = bch2_trans_update(trans, &iter, new, 0);
if (ret)
goto err;
-
- *idx = k->k.p.offset;
+next:
+ *idx = k.k->p.offset;
err:
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ret;
}
static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags, size_t r_idx)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags,
+ size_t r_idx)
{
struct bch_fs *c = trans->c;
struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- u64 start = le64_to_cpu(p.v->idx);
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
+ u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
s64 ret = 0;
struct printbuf buf = PRINTBUF;
@@ -163,60 +374,40 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
BUG_ON((s64) r->refcount + add < 0);
- r->refcount += add;
+ if (flags & BTREE_TRIGGER_gc)
+ r->refcount += add;
*idx = r->offset;
return 0;
not_found:
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
- ret = PTR_ERR_OR_ZERO(update);
+ if (flags & BTREE_TRIGGER_check_repair) {
+ ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
if (ret)
goto err;
-
- if (next_idx <= start) {
- bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
- } else if (*idx >= end) {
- bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
- } else {
- bkey_error_init(update);
- update->k.p = p.k->p;
- update->k.p.offset = next_idx;
- update->k.size = next_idx - *idx;
- set_bkey_val_u64s(&update->k, 0);
- }
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
}
*idx = next_idx;
err:
-fsck_err:
printbuf_exit(&buf);
return ret;
}
static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
int ret = 0;
- u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
+ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
+ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
while (idx < end && !ret)
ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
size_t l = 0, r = c->reflink_gc_nr;
while (l < r) {
@@ -239,10 +430,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert)) {
struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
v->front_pad = v->back_pad = 0;
@@ -251,92 +442,48 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
}
-/* indirect extents */
+/* indirect extent trigger */
-int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+static inline void
+check_indirect_extent_deleting(struct bkey_s new,
+ enum btree_iter_update_trigger_flags *flags)
{
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
-
-static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
-{
- if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
new.k->type = KEY_TYPE_deleted;
new.k->size = 0;
set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_INSERT;
+ *flags &= ~BTREE_TRIGGER_insert;
}
}
int bch2_trigger_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT))
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert))
check_indirect_extent_deleting(new, &flags);
return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
}
-/* indirect inline data */
-
-int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
-{
- return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "refcount %llu datalen %u: %*phN",
- le64_to_cpu(d.v->refcount), datalen,
- min(datalen, 32U), d.v->data);
-}
-
int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
check_indirect_extent_deleting(new, &flags);
return 0;
}
+/* create */
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
- struct bkey_i *orig)
+ struct bkey_i *orig,
+ bool reflink_p_may_update_opts_field)
{
struct bch_fs *c = trans->c;
struct btree_iter reflink_iter = { NULL };
@@ -350,12 +497,20 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_prev(&reflink_iter);
ret = bkey_err(k);
if (ret)
goto err;
+ /*
+ * XXX: we're assuming that 56 bits will be enough for the life of the
+ * filesystem: we need to implement wraparound, with a cursor in the
+ * logged ops btree:
+ */
+ if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
+ return -ENOSPC;
+
r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
@@ -365,7 +520,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.version = orig->k.version;
+ r_v->k.bversion = orig->k.bversion;
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
@@ -392,10 +547,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
memset(&r_p->v, 0, sizeof(r_p->v));
#endif
- r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+ SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
+
+ if (reflink_p_may_update_opts_field)
+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch2_trans_iter_exit(trans, &reflink_iter);
@@ -407,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
if (bkey_extent_is_unwritten(k))
continue;
@@ -424,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c,
subvol_inum dst_inum, u64 dst_offset,
subvol_inum src_inum, u64 src_offset,
u64 remap_sectors,
- u64 new_i_size, s64 *i_sectors_delta)
+ u64 new_i_size, s64 *i_sectors_delta,
+ bool may_change_src_io_path_opts)
{
struct btree_trans *trans;
struct btree_iter dst_iter, src_iter;
@@ -437,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos src_want;
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
+ bool reflink_p_may_update_opts_field =
+ bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
@@ -456,9 +617,9 @@ s64 bch2_remap_range(struct bch_fs *c,
goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while ((ret == 0 ||
bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
@@ -518,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c,
src_k = bkey_i_to_s_c(new_src.k);
ret = bch2_make_extent_indirect(trans, &src_iter,
- new_src.k);
+ new_src.k,
+ reflink_p_may_update_opts_field);
if (ret)
continue;
@@ -531,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bkey_i_reflink_p *dst_p =
bkey_reflink_p_init(new_dst.k);
- u64 offset = le64_to_cpu(src_p.v->idx) +
+ u64 offset = REFLINK_P_IDX(src_p.v) +
(src_want.offset -
bkey_start_offset(src_k.k));
- dst_p->v.idx = cpu_to_le64(offset);
+ SET_REFLINK_P_IDX(&dst_p->v, offset);
+
+ if (reflink_p_may_update_opts_field &&
+ may_change_src_io_path_opts)
+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
} else {
BUG();
}
@@ -545,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
@@ -568,7 +734,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_begin(trans);
ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_INTENT);
+ dst_inum, BTREE_ITER_intent);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
@@ -589,3 +755,97 @@ err:
return dst_done ?: ret ?: ret2;
}
+
+/* fsck */
+
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
+{
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
+
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
+ trans, reflink_v_refcount_wrong,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_gc_reflink_done(struct bch_fs *c)
+{
+ size_t idx = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
+ c->reflink_gc_nr = 0;
+ return ret;
+}
+
+int bch2_gc_reflink_start(struct bch_fs *c)
+{
+ c->reflink_gc_nr = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
+
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 4d8867289717..1632780bdf18 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -2,50 +2,48 @@
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-enum bkey_invalid_flags;
-
-int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
- .key_invalid = bch2_reflink_p_invalid, \
+ .key_validate = bch2_reflink_p_validate, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
.trigger = bch2_trigger_reflink_p, \
.min_val_size = 16, \
})
-int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
- .key_invalid = bch2_reflink_v_invalid, \
+ .key_validate = bch2_reflink_v_validate, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
-int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
- unsigned);
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
- .key_invalid = bch2_indirect_inline_data_invalid, \
+ .key_validate = bch2_indirect_inline_data_validate, \
.val_to_text = bch2_indirect_inline_data_to_text, \
.trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
@@ -75,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k)
}
}
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
+ s64 *, struct bkey_s_c_reflink_p,
+ bool, unsigned);
+
s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
- subvol_inum, u64, u64, u64, s64 *);
+ subvol_inum, u64, u64, u64, s64 *,
+ bool);
+
+int bch2_gc_reflink_done(struct bch_fs *);
+int bch2_gc_reflink_start(struct bch_fs *);
#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
index 6772eebb1fc6..92995e4f898e 100644
--- a/fs/bcachefs/reflink_format.h
+++ b/fs/bcachefs/reflink_format.h
@@ -4,7 +4,7 @@
struct bch_reflink_p {
struct bch_val v;
- __le64 idx;
+ __le64 idx_flags;
/*
* A reflink pointer might point to an indirect extent which is then
* later split (by copygc or rebalance). If we only pointed to part of
@@ -17,6 +17,11 @@ struct bch_reflink_p {
__le32 back_pad;
} __packed __aligned(8);
+LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
+LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
+LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
+ struct bch_reflink_p, idx_flags, 57, 58);
+
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..477ef0997949 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -2,16 +2,20 @@
#include "bcachefs.h"
#include "buckets.h"
+#include "disk_accounting.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@@ -20,14 +24,11 @@ static int bch2_memcmp(const void *l, const void *r, size_t size)
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- unsigned i;
-
- BUG_ON(e->data_type >= BCH_DATA_NR);
BUG_ON(!e->nr_devs);
BUG_ON(e->nr_required > 1 &&
e->nr_required >= e->nr_devs);
- for (i = 0; i + 1 < e->nr_devs; i++)
+ for (unsigned i = 0; i + 1 < e->nr_devs; i++)
BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}
@@ -39,7 +40,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -64,8 +66,36 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
+{
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_member_exists(sb, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
+}
+
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
+ struct bch_fs *c,
struct printbuf *err)
{
if (!r->nr_devs) {
@@ -80,7 +110,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
- if (!bch2_dev_exists(sb, r->devs[i])) {
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@@ -120,7 +151,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
- r->devs[r->nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@@ -137,7 +168,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
- r->devs[r->nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@@ -178,7 +209,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
- e->devs[e->nr_devs++] = *i;
+ replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}
@@ -188,24 +219,17 @@ cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
struct bch_replicas_entry_v1 *new_entry)
{
- unsigned i;
struct bch_replicas_cpu new = {
.nr = old->nr + 1,
.entry_size = max_t(unsigned, old->entry_size,
replicas_entry_bytes(new_entry)),
};
- for (i = 0; i < new_entry->nr_devs; i++)
- BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
-
- BUG_ON(!new_entry->data_type);
- verify_replicas_entry(new_entry);
-
new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
- for (i = 0; i < old->nr; i++)
+ for (unsigned i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(&new, i),
cpu_replicas_entry(old, i),
old->entry_size);
@@ -226,9 +250,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size))
return -1;
- verify_replicas_entry(search);
-
-#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@@ -250,145 +272,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}
-bool bch2_replicas_marked(struct bch_fs *c,
+bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
- bool marked;
-
- if (!search->nr_devs)
- return true;
-
verify_replicas_entry(search);
- percpu_down_read(&c->mark_lock);
- marked = __replicas_has_entry(&c->replicas, search) &&
- (likely((!c->replicas_gc.entries)) ||
- __replicas_has_entry(&c->replicas_gc, search));
- percpu_up_read(&c->mark_lock);
-
- return marked;
+ return !search->nr_devs ||
+ (__replicas_has_entry(&c->replicas, search) &&
+ (likely((!c->replicas_gc.entries)) ||
+ __replicas_has_entry(&c->replicas_gc, search)));
}
-static void __replicas_table_update(struct bch_fs_usage *dst,
- struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage *src,
- struct bch_replicas_cpu *src_r)
-{
- int src_idx, dst_idx;
-
- *dst = *src;
-
- for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
- if (!src->replicas[src_idx])
- continue;
-
- dst_idx = __replicas_entry_idx(dst_r,
- cpu_replicas_entry(src_r, src_idx));
- BUG_ON(dst_idx < 0);
-
- dst->replicas[dst_idx] = src->replicas[src_idx];
- }
-}
-
-static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
- struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage __percpu *src_p,
- struct bch_replicas_cpu *src_r)
-{
- unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
- struct bch_fs_usage *dst, *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
-
- preempt_disable();
- dst = this_cpu_ptr(dst_p);
- preempt_enable();
-
- __replicas_table_update(dst, dst_r, src, src_r);
-}
-
-/*
- * Resize filesystem accounting:
- */
-static int replicas_table_update(struct bch_fs *c,
- struct bch_replicas_cpu *new_r)
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry_v1 *search)
{
- struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
- struct bch_fs_usage_online *new_scratch = NULL;
- struct bch_fs_usage __percpu *new_gc = NULL;
- struct bch_fs_usage *new_base = NULL;
- unsigned i, bytes = sizeof(struct bch_fs_usage) +
- sizeof(u64) * new_r->nr;
- unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
- sizeof(u64) * new_r->nr;
- int ret = 0;
-
- memset(new_usage, 0, sizeof(new_usage));
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
- sizeof(u64), GFP_KERNEL)))
- goto err;
+ percpu_down_read(&c->mark_lock);
+ bool ret = bch2_replicas_marked_locked(c, search);
+ percpu_up_read(&c->mark_lock);
- if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
- !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
- (c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- if (c->usage[i])
- __replicas_table_update_pcpu(new_usage[i], new_r,
- c->usage[i], &c->replicas);
- if (c->usage_base)
- __replicas_table_update(new_base, new_r,
- c->usage_base, &c->replicas);
- if (c->usage_gc)
- __replicas_table_update_pcpu(new_gc, new_r,
- c->usage_gc, &c->replicas);
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- swap(c->usage[i], new_usage[i]);
- swap(c->usage_base, new_base);
- swap(c->usage_scratch, new_scratch);
- swap(c->usage_gc, new_gc);
- swap(c->replicas, *new_r);
-out:
- free_percpu(new_gc);
- kfree(new_scratch);
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- free_percpu(new_usage[i]);
- kfree(new_base);
return ret;
-err:
- bch_err(c, "error updating replicas table: memory allocation failure");
- ret = -BCH_ERR_ENOMEM_replicas_table;
- goto out;
-}
-
-static unsigned reserve_journal_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_replicas_entry_v1 *e;
- unsigned journal_res_u64s = 0;
-
- /* nr_inodes: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
-
- /* key_version: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
-
- /* persistent_reserved: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
- BCH_REPLICAS_MAX;
-
- for_each_cpu_replicas_entry(r, e)
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
- e->nr_devs, sizeof(u64));
- return journal_res_u64s;
}
noinline
@@ -424,10 +326,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
goto err;
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->replicas_journal_res,
- reserve_journal_replicas(c, &new_r));
}
if (!new_r.entries &&
@@ -442,7 +340,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
/* don't update in memory replicas until changes are persistent */
percpu_down_write(&c->mark_lock);
if (new_r.entries)
- ret = replicas_table_update(c, &new_r);
+ swap(c->replicas, new_r);
if (new_gc.entries)
swap(new_gc, c->replicas_gc);
percpu_up_write(&c->mark_lock);
@@ -464,20 +362,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
? 0 : bch2_mark_replicas_slowpath(c, r);
}
-/* replicas delta list: */
-
-int bch2_replicas_delta_list_mark(struct bch_fs *c,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
- int ret = 0;
-
- for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
- ret = bch2_mark_replicas(c, &d->r);
- return ret;
-}
-
/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
@@ -491,8 +375,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
percpu_down_write(&c->mark_lock);
ret = ret ?:
- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
- replicas_table_update(c, &c->replicas_gc);
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+ if (!ret)
+ swap(c->replicas, c->replicas_gc);
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
@@ -520,13 +405,16 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
c->replicas_gc.nr = 0;
c->replicas_gc.entry_size = 0;
- for_each_cpu_replicas_entry(&c->replicas, e)
- if (!((1 << e->data_type) & typemask)) {
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ /* Preserve unknown data types */
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask)) {
c->replicas_gc.nr++;
c->replicas_gc.entry_size =
max_t(unsigned, c->replicas_gc.entry_size,
replicas_entry_bytes(e));
}
+ }
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
@@ -538,7 +426,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
}
for_each_cpu_replicas_entry(&c->replicas, e)
- if (!((1 << e->data_type) & typemask))
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
e, c->replicas_gc.entry_size);
@@ -559,10 +448,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
int bch2_replicas_gc2(struct bch_fs *c)
{
struct bch_replicas_cpu new = { 0 };
- unsigned i, nr;
+ unsigned nr;
int ret = 0;
- bch2_journal_meta(&c->journal);
+ bch2_accounting_mem_gc(c);
retry:
nr = READ_ONCE(c->replicas.nr);
new.entry_size = READ_ONCE(c->replicas.entry_size);
@@ -583,24 +472,34 @@ retry:
goto retry;
}
- for (i = 0; i < c->replicas.nr; i++) {
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- if (e->data_type == BCH_DATA_journal ||
- c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]) ||
- percpu_u64_get(&c->usage[2]->replicas[i]) ||
- percpu_u64_get(&c->usage[3]->replicas[i]))
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
+
+ struct bpos p = disk_accounting_pos_to_bpos(&k);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p) >= acc->k.nr;
+
+ if (e->data_type == BCH_DATA_journal || !kill)
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
bch2_cpu_replicas_sort(&new);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
- replicas_table_update(c, &new);
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+
+ if (!ret)
+ swap(c->replicas, new);
kfree(new.entries);
@@ -614,34 +513,6 @@ retry:
return ret;
}
-int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry_v1 *r,
- u64 sectors)
-{
- int ret, idx = bch2_replicas_entry_idx(c, r);
-
- if (idx < 0) {
- struct bch_replicas_cpu n;
-
- n = cpu_replicas_add_entry(c, &c->replicas, r);
- if (!n.entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- ret = replicas_table_update(c, &n);
- if (ret)
- return ret;
-
- kfree(n.entries);
-
- idx = bch2_replicas_entry_idx(c, r);
- BUG_ON(ret < 0);
- }
-
- c->usage_base->replicas[idx] = sectors;
-
- return 0;
-}
-
/* Replicas tracking - superblock: */
static int
@@ -727,8 +598,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
bch2_cpu_replicas_sort(&new_r);
percpu_down_write(&c->mark_lock);
-
- ret = replicas_table_update(c, &new_r);
+ swap(c->replicas, new_r);
percpu_up_write(&c->mark_lock);
kfree(new_r.entries);
@@ -824,16 +694,17 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate(e, sb, err);
+ int ret = bch2_replicas_entry_sb_validate(e, sb, err);
if (ret)
return ret;
@@ -855,7 +726,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
}
static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_replicas_cpu cpu_r;
@@ -894,7 +765,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
};
static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_cpu cpu_r;
@@ -942,20 +813,27 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
percpu_down_read(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ unsigned nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
if (e->data_type == BCH_DATA_cached)
continue;
- for (i = 0; i < e->nr_devs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+ rcu_read_lock();
+ for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
nr_online += test_bit(e->devs[i], devs.d);
- nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
+ rcu_read_unlock();
- if (nr_failed == e->nr_devs)
+ if (nr_online + nr_failed == e->nr_devs)
continue;
if (nr_online < e->nr_required)
@@ -991,7 +869,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
struct bch_sb_field_replicas *replicas;
struct bch_sb_field_replicas_v0 *replicas_v0;
- unsigned i, data_has = 0;
+ unsigned data_has = 0;
replicas = bch2_sb_field_get(sb, replicas);
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
@@ -999,17 +877,26 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
if (replicas) {
struct bch_replicas_entry_v1 *r;
- for_each_replicas_entry(replicas, r)
- for (i = 0; i < r->nr_devs; i++)
+ for_each_replicas_entry(replicas, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
if (r->devs[i] == dev)
data_has |= 1 << r->data_type;
+ }
+
} else if (replicas_v0) {
struct bch_replicas_entry_v0 *r;
- for_each_replicas_entry_v0(replicas_v0, r)
- for (i = 0; i < r->nr_devs; i++)
+ for_each_replicas_entry_v0(replicas_v0, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
if (r->devs[i] == dev)
data_has |= 1 << r->data_type;
+ }
}
@@ -1018,10 +905,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned ret;
-
mutex_lock(&c->sb_lock);
- ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
mutex_unlock(&c->sb_lock);
return ret;
@@ -1029,25 +914,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
void bch2_fs_replicas_exit(struct bch_fs *c)
{
- unsigned i;
-
- kfree(c->usage_scratch);
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- free_percpu(c->usage[i]);
- kfree(c->usage_base);
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
-
- mempool_exit(&c->replicas_delta_pool);
-}
-
-int bch2_fs_replicas_init(struct bch_fs *c)
-{
- bch2_journal_entry_res_resize(&c->journal,
- &c->replicas_journal_res,
- reserve_journal_replicas(c, &c->replicas));
-
- return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
- REPLICAS_DELTA_LIST_MAX) ?:
- replicas_table_update(c, &c->replicas);
}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 654a4b26d3a3..5aba2c1ce133 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_sb *, struct printbuf *);
+ struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 *
@@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
+
+bool bch2_replicas_marked_locked(struct bch_fs *,
+ struct bch_replicas_entry_v1 *);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry_v1 *);
-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
- return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
@@ -58,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
-int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry_v1 *,
- u64);
-
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
@@ -88,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
void bch2_fs_replicas_exit(struct bch_fs *);
-int bch2_fs_replicas_init(struct bch_fs *);
#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
new file mode 100644
index 000000000000..b7eff904acdb
--- /dev/null
+++ b/fs/bcachefs/replicas_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_FORMAT_H
+#define _BCACHEFS_REPLICAS_FORMAT_H
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[] __counted_by(nr_devs);
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry_v1 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[] __counted_by(nr_devs);
+} __packed;
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v1 entries[];
+} __packed __aligned(8);
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+#define replicas_entry_add_dev(e, d) ({ \
+ (e)->nr_devs++; \
+ (e)->devs[(e)->nr_devs - 1] = (d); \
+})
+
+#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index ac90d142c4e8..fed71c861fe7 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -8,20 +8,4 @@ struct bch_replicas_cpu {
struct bch_replicas_entry_v1 *entries;
};
-struct replicas_delta {
- s64 delta;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct replicas_delta_list {
- unsigned size;
- unsigned used;
-
- struct {} memset_start;
- u64 nr_inodes;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- struct {} memset_end;
- struct replicas_delta d[];
-};
-
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index b6bf0ebe7e84..59c8770e4a0e 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -23,16 +23,28 @@
int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
int write)
{
+ struct bkey_validate_context from = {
+ .flags = write,
+ .from = BKEY_VALIDATE_superblock,
+ };
struct jset_entry *entry;
int ret;
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if (vstruct_end(entry) > vstruct_end(&clean->field)) {
+ bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
+ le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
+ (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
+ bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- write);
+ from);
if (ret)
return ret;
}
@@ -147,7 +159,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
mutex_unlock(&c->sb_lock);
- return NULL;
+ return ERR_PTR(-BCH_ERR_invalid_sb_clean);
}
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
@@ -159,6 +171,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
+ kfree(clean);
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
@@ -171,45 +184,10 @@ fsck_err:
return ERR_PTR(ret);
}
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
- percpu_down_read(&c->mark_lock);
-
- if (!journal_seq) {
- for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
- } else {
- bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
- }
-
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
- }
-
{
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
@@ -220,49 +198,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_reserved;
- u->entry.level = i;
- u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
- }
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
- struct jset_entry_data_usage *u =
- container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
- struct jset_entry_data_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_data_usage;
- u->v = cpu_to_le64(c->usage_base->replicas[i]);
- unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
- "embedded variable length struct");
- }
-
- for_each_member_device(c, ca) {
- unsigned b = sizeof(struct jset_entry_dev_usage) +
- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
- struct jset_entry_dev_usage *u =
- container_of(jset_entry_init(end, b),
- struct jset_entry_dev_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(ca->dev_idx);
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
- u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
- u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
- }
- }
-
- percpu_up_read(&c->mark_lock);
-
for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
@@ -274,9 +209,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
}
-static int bch2_sb_clean_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
@@ -286,6 +220,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb,
return -BCH_ERR_invalid_sb_clean;
}
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
+ prt_str(err, "entry type ");
+ bch2_prt_jset_entry_type(err, entry->type);
+ prt_str(err, " overruns end of section");
+ return -BCH_ERR_invalid_sb_clean;
+ }
+ }
+
return 0;
}
@@ -295,14 +240,15 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_clean *clean = field_to_type(f, clean);
struct jset_entry *entry;
- prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
- prt_newline(out);
- prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
- prt_newline(out);
+ prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
+ prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
+ break;
+
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
@@ -386,6 +332,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
goto out;
}
+ bch2_journal_pos_from_member_info_set(c);
+
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 7dc898761bb3..6992e7469112 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
};
-static int bch2_sb_counters_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
};
@@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- for (i = 0; i < nr; i++) {
- if (i < BCH_COUNTER_NR)
- prt_printf(out, "%s ", bch2_counter_names[i]);
- else
- prt_printf(out, "(unknown)");
-
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < nr; i++)
+ prt_printf(out, "%s \t%llu\n",
+ i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
+ le64_to_cpu(ctrs->d[i]));
};
int bch2_sb_counters_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 62ea478215d0..fdcf598f08b1 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -2,86 +2,91 @@
#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76) \
- x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78)
+enum counters_flags {
+ TYPE_COUNTER = BIT(0), /* event counters */
+ TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
+};
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0, TYPE_SECTORS) \
+ x(io_write, 1, TYPE_SECTORS) \
+ x(io_move, 2, TYPE_SECTORS) \
+ x(bucket_invalidate, 3, TYPE_COUNTER) \
+ x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_alloc, 5, TYPE_COUNTER) \
+ x(bucket_alloc_fail, 6, TYPE_COUNTER) \
+ x(btree_cache_scan, 7, TYPE_COUNTER) \
+ x(btree_cache_reap, 8, TYPE_COUNTER) \
+ x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
+ x(btree_node_write, 13, TYPE_COUNTER) \
+ x(btree_node_read, 14, TYPE_COUNTER) \
+ x(btree_node_compact, 15, TYPE_COUNTER) \
+ x(btree_node_merge, 16, TYPE_COUNTER) \
+ x(btree_node_split, 17, TYPE_COUNTER) \
+ x(btree_node_rewrite, 18, TYPE_COUNTER) \
+ x(btree_node_alloc, 19, TYPE_COUNTER) \
+ x(btree_node_free, 20, TYPE_COUNTER) \
+ x(btree_node_set_root, 21, TYPE_COUNTER) \
+ x(btree_path_relock_fail, 22, TYPE_COUNTER) \
+ x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
+ x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
+ x(journal_entry_full, 25, TYPE_COUNTER) \
+ x(journal_full, 26, TYPE_COUNTER) \
+ x(journal_reclaim_finish, 27, TYPE_COUNTER) \
+ x(journal_reclaim_start, 28, TYPE_COUNTER) \
+ x(journal_write, 29, TYPE_COUNTER) \
+ x(read_promote, 30, TYPE_COUNTER) \
+ x(read_bounce, 31, TYPE_COUNTER) \
+ x(read_split, 33, TYPE_COUNTER) \
+ x(read_retry, 32, TYPE_COUNTER) \
+ x(read_reuse_race, 34, TYPE_COUNTER) \
+ x(move_extent_read, 35, TYPE_SECTORS) \
+ x(move_extent_write, 36, TYPE_SECTORS) \
+ x(move_extent_finish, 37, TYPE_SECTORS) \
+ x(move_extent_fail, 38, TYPE_COUNTER) \
+ x(move_extent_start_fail, 39, TYPE_COUNTER) \
+ x(copygc, 40, TYPE_COUNTER) \
+ x(copygc_wait, 41, TYPE_COUNTER) \
+ x(gc_gens_end, 42, TYPE_COUNTER) \
+ x(gc_gens_start, 43, TYPE_COUNTER) \
+ x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
+ x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
+ x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
+ x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
+ x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
+ x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
+ x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
+ x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
+ x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
+ x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
+ x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
+ x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
+ x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
+ x(trans_restart_relock, 57, TYPE_COUNTER) \
+ x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
+ x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
+ x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
+ x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
+ x(trans_restart_relock_path, 62, TYPE_COUNTER) \
+ x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
+ x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
+ x(trans_restart_traverse, 65, TYPE_COUNTER) \
+ x(trans_restart_upgrade, 66, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
+ x(trans_restart_injected, 69, TYPE_COUNTER) \
+ x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
+ x(trans_traverse_all, 71, TYPE_COUNTER) \
+ x(transaction_commit, 72, TYPE_COUNTER) \
+ x(write_super, 73, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
+ x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
+ x(trans_restart_split_race, 76, TYPE_COUNTER) \
+ x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
+ x(write_buffer_flush_sync, 78, TYPE_COUNTER)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 441dcb1bf160..051214fdc735 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
#include "bcachefs.h"
#include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
@@ -45,9 +45,100 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
-
-#define DOWNGRADE_TABLE()
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(subvolume_fs_parent, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
+ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
+ x(btree_subvolume_children, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
+ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(disk_accounting_inum, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(inode_has_child_snapshots, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
+
+#define DOWNGRADE_TABLE() \
+ x(bucket_stripe_sectors, \
+ 0) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_accounting_replicas_not_marked, \
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
struct upgrade_downgrade_entry {
u64 recovery_passes;
@@ -71,6 +162,40 @@ UPGRADE_TABLE()
#undef x
};
+static int have_stripes(struct bch_fs *c)
+{
+ if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
+ return 0;
+
+ return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
+}
+
+int bch2_sb_set_upgrade_extra(struct bch_fs *c)
+{
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = c->sb.version;
+ bool write_sb = false;
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
+ new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
+ (ret = have_stripes(c) > 0)) {
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
+ write_sb = true;
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret < 0 ? ret : 0;
+}
+
void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
@@ -92,16 +217,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(passes));
- for (const u16 *e = i->errors;
- e < i->errors + i->nr_errors;
- e++) {
- __set_bit(*e, c->sb.errors_silent);
- ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
- }
+ for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
+ __set_bit_le64(*e, ext->errors_silent);
}
}
-#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x
@@ -116,6 +237,37 @@ DOWNGRADE_TABLE()
#undef x
};
+static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
+{
+ struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
+ int ret = 0;
+
+ unsigned nr_errors = le16_to_cpu(dst->nr_errors);
+
+ switch (le16_to_cpu(dst->version)) {
+ case bcachefs_metadata_version_bucket_stripe_sectors:
+ if (have_stripes(c)) {
+ bytes += sizeof(dst->errors[0]) * 2;
+
+ ret = darray_make_room(table, bytes);
+ if (ret)
+ return ret;
+
+ /* open coded __set_bit_le64, as dst is packed and
+ * dst->recovery_passes is misaligned */
+ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
+ dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
+
+ dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
+ }
+ break;
+ }
+
+ dst->nr_errors = cpu_to_le16(nr_errors);
+ return ret;
+}
+
static inline const struct bch_sb_field_downgrade_entry *
downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
{
@@ -125,15 +277,32 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
#define for_each_downgrade_entry(_d, _i) \
for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
(void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \
+ (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
+ (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
_i = downgrade_entry_next_c(_i))
static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
- for_each_downgrade_entry(e, i) {
+ for (const struct bch_sb_field_downgrade_entry *i = e->entries;
+ (void *) i < vstruct_end(&e->field);
+ i = downgrade_entry_next_c(i)) {
+ /*
+ * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
+ * section sizes are 8 byte aligned - an empty entry spanning
+ * the end of the section is allowed (and ignored):
+ */
+ if ((void *) &i->errors[0] > vstruct_end(&e->field))
+ break;
+
+ if (flags & BCH_VALIDATE_write &&
+ (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
+ prt_printf(err, "downgrade entry overruns end of superblock section");
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+
if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
@@ -155,26 +324,22 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
printbuf_tabstop_push(out, 16);
for_each_downgrade_entry(e, i) {
- prt_str(out, "version:");
- prt_tab(out);
+ prt_str(out, "version:\t");
bch2_version_to_text(out, le16_to_cpu(i->version));
prt_newline(out);
- prt_str(out, "recovery passes:");
- prt_tab(out);
+ prt_str(out, "recovery passes:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
prt_newline(out);
- prt_str(out, "errors:");
- prt_tab(out);
+ prt_str(out, "errors:\t");
bool first = true;
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
if (!first)
prt_char(out, ',');
first = false;
- unsigned e = le16_to_cpu(i->errors[j]);
- prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
}
prt_newline(out);
}
@@ -187,6 +352,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
int bch2_sb_downgrade_update(struct bch_fs *c)
{
+ if (!test_bit(BCH_FS_btree_running, &c->flags))
+ return 0;
+
darray_char table = {};
int ret = 0;
@@ -205,13 +373,22 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
dst = (void *) &darray_top(table);
dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes);
+ dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
dst->recovery_passes[1] = 0;
dst->nr_errors = cpu_to_le16(src->nr_errors);
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);
- table.nr += bytes;
+ ret = downgrade_table_extra(c, &table);
+ if (ret)
+ goto out;
+
+ if (!dst->recovery_passes[0] &&
+ !dst->recovery_passes[1] &&
+ !dst->nr_errors)
+ continue;
+
+ table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
}
struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
@@ -250,10 +427,10 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_SB_ERR_MAX)
+ if (e < BCH_FSCK_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
- ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ __set_bit_le64(e, ext->errors_silent);
}
}
}
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
index 57e6c916fc73..095b7cc9bb47 100644
--- a/fs/bcachefs/sb-downgrade.h
+++ b/fs/bcachefs/sb-downgrade.h
@@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
+int bch2_sb_set_upgrade_extra(struct bch_fs *);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
new file mode 100644
index 000000000000..cffd932be3ec
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade_format.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 5f5bcae391fb..013a96883b4e 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -7,12 +7,12 @@
const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_SB_ERRS()
- NULL
+#undef x
};
-static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
- if (id < BCH_SB_ERR_MAX)
+ if (id < BCH_FSCK_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]);
else
prt_printf(out, "(unknown error %u)", id);
@@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
}
static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
@@ -110,19 +110,25 @@ out:
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst =
- bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
+ struct bch_sb_field_errors *dst;
unsigned i;
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
if (!dst)
- return;
+ goto err;
for (i = 0; i < src->nr; i++) {
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
}
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index 8889001e7db4..b2357b8e6107 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -6,6 +6,8 @@
extern const char * const bch2_sb_error_strs[];
+void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
new file mode 100644
index 000000000000..b86ec013d7d7
--- /dev/null
+++ b/fs/bcachefs/sb-errors_format.h
@@ -0,0 +1,336 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
+#define _BCACHEFS_SB_ERRORS_FORMAT_H
+
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NO_RATELIMIT = 1 << 2,
+ FSCK_AUTOFIX = 1 << 3,
+};
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0, 0) \
+ x(dirty_but_no_journal_entries, 1, 0) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
+ x(sb_clean_journal_seq_mismatch, 3, 0) \
+ x(sb_clean_btree_root_mismatch, 4, 0) \
+ x(sb_clean_missing, 5, 0) \
+ x(jset_unsupported_version, 6, 0) \
+ x(jset_unknown_csum, 7, 0) \
+ x(jset_last_seq_newer_than_seq, 8, 0) \
+ x(jset_past_bucket_end, 9, 0) \
+ x(jset_seq_blacklisted, 10, 0) \
+ x(journal_entries_missing, 11, 0) \
+ x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
+ x(journal_entry_past_jset_end, 13, 0) \
+ x(journal_entry_replicas_data_mismatch, 14, 0) \
+ x(journal_entry_bkey_u64s_0, 15, 0) \
+ x(journal_entry_bkey_past_end, 16, 0) \
+ x(journal_entry_bkey_bad_format, 17, 0) \
+ x(journal_entry_bkey_invalid, 18, 0) \
+ x(journal_entry_btree_root_bad_size, 19, 0) \
+ x(journal_entry_blacklist_bad_size, 20, 0) \
+ x(journal_entry_blacklist_v2_bad_size, 21, 0) \
+ x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
+ x(journal_entry_usage_bad_size, 23, 0) \
+ x(journal_entry_data_usage_bad_size, 24, 0) \
+ x(journal_entry_clock_bad_size, 25, 0) \
+ x(journal_entry_clock_bad_rw, 26, 0) \
+ x(journal_entry_dev_usage_bad_size, 27, 0) \
+ x(journal_entry_dev_usage_bad_dev, 28, 0) \
+ x(journal_entry_dev_usage_bad_pad, 29, 0) \
+ x(btree_node_unreadable, 30, 0) \
+ x(btree_node_fault_injected, 31, 0) \
+ x(btree_node_bad_magic, 32, 0) \
+ x(btree_node_bad_seq, 33, 0) \
+ x(btree_node_unsupported_version, 34, 0) \
+ x(btree_node_bset_older_than_sb_min, 35, 0) \
+ x(btree_node_bset_newer_than_sb, 36, 0) \
+ x(btree_node_data_missing, 37, 0) \
+ x(btree_node_bset_after_end, 38, 0) \
+ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
+ x(btree_node_replicas_data_mismatch, 40, 0) \
+ x(bset_unknown_csum, 41, 0) \
+ x(bset_bad_csum, 42, 0) \
+ x(bset_past_end_of_btree_node, 43, 0) \
+ x(bset_wrong_sector_offset, 44, 0) \
+ x(bset_empty, 45, 0) \
+ x(bset_bad_seq, 46, 0) \
+ x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \
+ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
+ x(btree_node_bad_btree, 49, 0) \
+ x(btree_node_bad_level, 50, 0) \
+ x(btree_node_bad_min_key, 51, 0) \
+ x(btree_node_bad_max_key, 52, 0) \
+ x(btree_node_bad_format, 53, 0) \
+ x(btree_node_bkey_past_bset_end, 54, 0) \
+ x(btree_node_bkey_bad_format, 55, 0) \
+ x(btree_node_bad_bkey, 56, 0) \
+ x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
+ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
+ x(btree_root_read_error, 59, FSCK_AUTOFIX) \
+ x(btree_root_bad_min_key, 60, 0) \
+ x(btree_root_bad_max_key, 61, 0) \
+ x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
+ x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
+ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
+ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
+ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
+ x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
+ x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
+ x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
+ x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
+ x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
+ x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
+ x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
+ x(bkey_version_in_future, 80, 0) \
+ x(bkey_u64s_too_small, 81, 0) \
+ x(bkey_invalid_type_for_btree, 82, 0) \
+ x(bkey_extent_size_zero, 83, 0) \
+ x(bkey_extent_size_greater_than_offset, 84, 0) \
+ x(bkey_size_nonzero, 85, 0) \
+ x(bkey_snapshot_nonzero, 86, 0) \
+ x(bkey_snapshot_zero, 87, 0) \
+ x(bkey_at_pos_max, 88, 0) \
+ x(bkey_before_start_of_btree_node, 89, 0) \
+ x(bkey_after_end_of_btree_node, 90, 0) \
+ x(bkey_val_size_nonzero, 91, 0) \
+ x(bkey_val_size_too_small, 92, 0) \
+ x(alloc_v1_val_size_bad, 93, 0) \
+ x(alloc_v2_unpack_error, 94, 0) \
+ x(alloc_v3_unpack_error, 95, 0) \
+ x(alloc_v4_val_size_bad, 96, 0) \
+ x(alloc_v4_backpointers_start_bad, 97, 0) \
+ x(alloc_key_data_type_bad, 98, 0) \
+ x(alloc_key_empty_but_have_data, 99, 0) \
+ x(alloc_key_dirty_sectors_0, 100, 0) \
+ x(alloc_key_data_type_inconsistency, 101, 0) \
+ x(alloc_key_to_missing_dev_bucket, 102, 0) \
+ x(alloc_key_cached_inconsistency, 103, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
+ x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
+ x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
+ x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
+ x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
+ x(bucket_sector_count_overflow, 112, 0) \
+ x(bucket_metadata_type_mismatch, 113, 0) \
+ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
+ x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
+ x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
+ x(bucket_gens_val_size_bad, 117, 0) \
+ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
+ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
+ x(need_discard_freespace_key_bad, 124, 0) \
+ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
+ x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_level_bad, 294, 0) \
+ x(backpointer_dev_bad, 297, 0) \
+ x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \
+ x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \
+ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
+ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
+ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
+ x(lru_entry_bad, 131, FSCK_AUTOFIX) \
+ x(btree_ptr_val_too_big, 132, 0) \
+ x(btree_ptr_v2_val_too_big, 133, 0) \
+ x(btree_ptr_has_non_ptr, 134, 0) \
+ x(extent_ptrs_invalid_entry, 135, 0) \
+ x(extent_ptrs_no_ptrs, 136, 0) \
+ x(extent_ptrs_too_many_ptrs, 137, 0) \
+ x(extent_ptrs_redundant_crc, 138, 0) \
+ x(extent_ptrs_redundant_stripe, 139, 0) \
+ x(extent_ptrs_unwritten, 140, 0) \
+ x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(ptr_to_invalid_device, 142, 0) \
+ x(ptr_to_duplicate_device, 143, 0) \
+ x(ptr_after_last_bucket, 144, 0) \
+ x(ptr_before_first_bucket, 145, 0) \
+ x(ptr_spans_multiple_buckets, 146, 0) \
+ x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
+ x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
+ x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
+ x(ptr_to_missing_stripe, 150, 0) \
+ x(ptr_to_incorrect_stripe, 151, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_too_stale, 153, 0) \
+ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
+ x(ptr_bucket_data_type_mismatch, 155, 0) \
+ x(ptr_cached_and_erasure_coded, 156, 0) \
+ x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
+ x(ptr_crc_csum_type_unknown, 158, 0) \
+ x(ptr_crc_compression_type_unknown, 159, 0) \
+ x(ptr_crc_redundant, 160, 0) \
+ x(ptr_crc_nonce_mismatch, 162, 0) \
+ x(ptr_stripe_redundant, 163, 0) \
+ x(reservation_key_nr_replicas_invalid, 164, 0) \
+ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
+ x(reflink_v_pos_bad, 292, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
+ x(reflink_refcount_underflow, 293, 0) \
+ x(stripe_pos_bad, 167, 0) \
+ x(stripe_val_size_bad, 168, 0) \
+ x(stripe_csum_granularity_bad, 290, 0) \
+ x(stripe_sector_count_wrong, 169, 0) \
+ x(snapshot_tree_pos_bad, 170, 0) \
+ x(snapshot_tree_to_missing_snapshot, 171, 0) \
+ x(snapshot_tree_to_missing_subvol, 172, 0) \
+ x(snapshot_tree_to_wrong_subvol, 173, 0) \
+ x(snapshot_tree_to_snapshot_subvol, 174, 0) \
+ x(snapshot_pos_bad, 175, 0) \
+ x(snapshot_parent_bad, 176, 0) \
+ x(snapshot_children_not_normalized, 177, 0) \
+ x(snapshot_child_duplicate, 178, 0) \
+ x(snapshot_child_bad, 179, 0) \
+ x(snapshot_skiplist_not_normalized, 180, 0) \
+ x(snapshot_skiplist_bad, 181, 0) \
+ x(snapshot_should_not_have_subvol, 182, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
+ x(snapshot_bad_depth, 184, 0) \
+ x(snapshot_bad_skiplist, 185, 0) \
+ x(subvol_pos_bad, 186, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_to_missing_root, 188, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(bkey_in_missing_snapshot, 190, 0) \
+ x(inode_pos_inode_nonzero, 191, 0) \
+ x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_alloc_cursor_inode_bad, 301, 0) \
+ x(inode_unpack_error, 193, 0) \
+ x(inode_str_hash_invalid, 194, 0) \
+ x(inode_v3_fields_start_bad, 195, 0) \
+ x(inode_snapshot_mismatch, 196, 0) \
+ x(inode_unlinked_but_clean, 197, 0) \
+ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_unlinked_and_not_open, 281, 0) \
+ x(inode_unlinked_but_has_dirent, 285, 0) \
+ x(inode_checksum_type_invalid, 199, 0) \
+ x(inode_compression_type_invalid, 200, 0) \
+ x(inode_subvol_root_but_not_dir, 201, 0) \
+ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
+ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
+ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
+ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
+ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
+ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
+ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
+ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
+ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_has_child_snapshots_wrong, 287, 0) \
+ x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
+ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
+ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
+ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
+ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
+ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
+ x(extent_overlapping, 215, 0) \
+ x(key_in_missing_inode, 216, 0) \
+ x(key_in_wrong_inode_type, 217, 0) \
+ x(extent_past_end_of_inode, 218, 0) \
+ x(dirent_empty_name, 219, 0) \
+ x(dirent_val_too_big, 220, 0) \
+ x(dirent_name_too_long, 221, 0) \
+ x(dirent_name_embedded_nul, 222, 0) \
+ x(dirent_name_dot_or_dotdot, 223, 0) \
+ x(dirent_name_has_slash, 224, 0) \
+ x(dirent_d_type_wrong, 225, 0) \
+ x(inode_bi_parent_wrong, 226, 0) \
+ x(dirent_in_missing_dir_inode, 227, 0) \
+ x(dirent_in_non_dir_inode, 228, 0) \
+ x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_overwritten_inode, 302, 0) \
+ x(dirent_to_missing_subvol, 230, 0) \
+ x(dirent_to_itself, 231, 0) \
+ x(quota_type_invalid, 232, 0) \
+ x(xattr_val_size_too_small, 233, 0) \
+ x(xattr_val_size_too_big, 234, 0) \
+ x(xattr_invalid_type, 235, 0) \
+ x(xattr_name_invalid_chars, 236, 0) \
+ x(xattr_in_missing_inode, 237, 0) \
+ x(root_subvol_missing, 238, 0) \
+ x(root_dir_missing, 239, 0) \
+ x(root_inode_not_dir, 240, 0) \
+ x(dir_loop, 241, 0) \
+ x(hash_table_key_duplicate, 242, 0) \
+ x(hash_table_key_wrong_offset, 243, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
+ x(reflink_p_front_pad_bad, 245, 0) \
+ x(journal_entry_dup_same_device, 246, 0) \
+ x(inode_bi_subvol_missing, 247, 0) \
+ x(inode_bi_subvol_wrong, 248, 0) \
+ x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
+ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
+ x(inode_bi_parent_nonzero, 251, 0) \
+ x(dirent_to_missing_parent_subvol, 252, 0) \
+ x(dirent_not_visible_in_parent_subvol, 253, 0) \
+ x(subvol_fs_path_parent_wrong, 254, 0) \
+ x(subvol_root_fs_path_parent_nonzero, 255, 0) \
+ x(subvol_children_not_set, 256, 0) \
+ x(subvol_children_bad, 257, 0) \
+ x(subvol_loop, 258, 0) \
+ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
+ x(btree_node_bkey_bad_u64s, 260, 0) \
+ x(btree_node_topology_empty_interior_node, 261, 0) \
+ x(btree_ptr_v2_min_key_bad, 262, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
+ x(snapshot_node_missing, 264, 0) \
+ x(dup_backpointer_to_bad_csum_extent, 265, 0) \
+ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
+ x(sb_clean_entry_overrun, 267, 0) \
+ x(btree_ptr_v2_written_0, 268, 0) \
+ x(subvol_snapshot_bad, 269, 0) \
+ x(subvol_inode_bad, 270, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
+ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
+ x(accounting_replicas_not_marked, 273, 0) \
+ x(accounting_to_invalid_device, 289, 0) \
+ x(invalid_btree_id, 274, 0) \
+ x(alloc_key_io_time_bad, 275, 0) \
+ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
+ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
+ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
+ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
+ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
+ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
+ x(MAX, 304, 0)
+
+enum bch_sb_error_id {
+#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index c08aacdfd073..40325239c3b0 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -4,261 +4,6 @@
#include "darray.h"
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0) \
- x(dirty_but_no_journal_entries, 1) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
- x(sb_clean_journal_seq_mismatch, 3) \
- x(sb_clean_btree_root_mismatch, 4) \
- x(sb_clean_missing, 5) \
- x(jset_unsupported_version, 6) \
- x(jset_unknown_csum, 7) \
- x(jset_last_seq_newer_than_seq, 8) \
- x(jset_past_bucket_end, 9) \
- x(jset_seq_blacklisted, 10) \
- x(journal_entries_missing, 11) \
- x(journal_entry_replicas_not_marked, 12) \
- x(journal_entry_past_jset_end, 13) \
- x(journal_entry_replicas_data_mismatch, 14) \
- x(journal_entry_bkey_u64s_0, 15) \
- x(journal_entry_bkey_past_end, 16) \
- x(journal_entry_bkey_bad_format, 17) \
- x(journal_entry_bkey_invalid, 18) \
- x(journal_entry_btree_root_bad_size, 19) \
- x(journal_entry_blacklist_bad_size, 20) \
- x(journal_entry_blacklist_v2_bad_size, 21) \
- x(journal_entry_blacklist_v2_start_past_end, 22) \
- x(journal_entry_usage_bad_size, 23) \
- x(journal_entry_data_usage_bad_size, 24) \
- x(journal_entry_clock_bad_size, 25) \
- x(journal_entry_clock_bad_rw, 26) \
- x(journal_entry_dev_usage_bad_size, 27) \
- x(journal_entry_dev_usage_bad_dev, 28) \
- x(journal_entry_dev_usage_bad_pad, 29) \
- x(btree_node_unreadable, 30) \
- x(btree_node_fault_injected, 31) \
- x(btree_node_bad_magic, 32) \
- x(btree_node_bad_seq, 33) \
- x(btree_node_unsupported_version, 34) \
- x(btree_node_bset_older_than_sb_min, 35) \
- x(btree_node_bset_newer_than_sb, 36) \
- x(btree_node_data_missing, 37) \
- x(btree_node_bset_after_end, 38) \
- x(btree_node_replicas_sectors_written_mismatch, 39) \
- x(btree_node_replicas_data_mismatch, 40) \
- x(bset_unknown_csum, 41) \
- x(bset_bad_csum, 42) \
- x(bset_past_end_of_btree_node, 43) \
- x(bset_wrong_sector_offset, 44) \
- x(bset_empty, 45) \
- x(bset_bad_seq, 46) \
- x(bset_blacklisted_journal_seq, 47) \
- x(first_bset_blacklisted_journal_seq, 48) \
- x(btree_node_bad_btree, 49) \
- x(btree_node_bad_level, 50) \
- x(btree_node_bad_min_key, 51) \
- x(btree_node_bad_max_key, 52) \
- x(btree_node_bad_format, 53) \
- x(btree_node_bkey_past_bset_end, 54) \
- x(btree_node_bkey_bad_format, 55) \
- x(btree_node_bad_bkey, 56) \
- x(btree_node_bkey_out_of_order, 57) \
- x(btree_root_bkey_invalid, 58) \
- x(btree_root_read_error, 59) \
- x(btree_root_bad_min_key, 60) \
- x(btree_root_bad_max_key, 61) \
- x(btree_node_read_error, 62) \
- x(btree_node_topology_bad_min_key, 63) \
- x(btree_node_topology_bad_max_key, 64) \
- x(btree_node_topology_overwritten_by_prev_node, 65) \
- x(btree_node_topology_overwritten_by_next_node, 66) \
- x(btree_node_topology_interior_node_empty, 67) \
- x(fs_usage_hidden_wrong, 68) \
- x(fs_usage_btree_wrong, 69) \
- x(fs_usage_data_wrong, 70) \
- x(fs_usage_cached_wrong, 71) \
- x(fs_usage_reserved_wrong, 72) \
- x(fs_usage_persistent_reserved_wrong, 73) \
- x(fs_usage_nr_inodes_wrong, 74) \
- x(fs_usage_replicas_wrong, 75) \
- x(dev_usage_buckets_wrong, 76) \
- x(dev_usage_sectors_wrong, 77) \
- x(dev_usage_fragmented_wrong, 78) \
- x(dev_usage_buckets_ec_wrong, 79) \
- x(bkey_version_in_future, 80) \
- x(bkey_u64s_too_small, 81) \
- x(bkey_invalid_type_for_btree, 82) \
- x(bkey_extent_size_zero, 83) \
- x(bkey_extent_size_greater_than_offset, 84) \
- x(bkey_size_nonzero, 85) \
- x(bkey_snapshot_nonzero, 86) \
- x(bkey_snapshot_zero, 87) \
- x(bkey_at_pos_max, 88) \
- x(bkey_before_start_of_btree_node, 89) \
- x(bkey_after_end_of_btree_node, 90) \
- x(bkey_val_size_nonzero, 91) \
- x(bkey_val_size_too_small, 92) \
- x(alloc_v1_val_size_bad, 93) \
- x(alloc_v2_unpack_error, 94) \
- x(alloc_v3_unpack_error, 95) \
- x(alloc_v4_val_size_bad, 96) \
- x(alloc_v4_backpointers_start_bad, 97) \
- x(alloc_key_data_type_bad, 98) \
- x(alloc_key_empty_but_have_data, 99) \
- x(alloc_key_dirty_sectors_0, 100) \
- x(alloc_key_data_type_inconsistency, 101) \
- x(alloc_key_to_missing_dev_bucket, 102) \
- x(alloc_key_cached_inconsistency, 103) \
- x(alloc_key_cached_but_read_time_zero, 104) \
- x(alloc_key_to_missing_lru_entry, 105) \
- x(alloc_key_data_type_wrong, 106) \
- x(alloc_key_gen_wrong, 107) \
- x(alloc_key_dirty_sectors_wrong, 108) \
- x(alloc_key_cached_sectors_wrong, 109) \
- x(alloc_key_stripe_wrong, 110) \
- x(alloc_key_stripe_redundancy_wrong, 111) \
- x(bucket_sector_count_overflow, 112) \
- x(bucket_metadata_type_mismatch, 113) \
- x(need_discard_key_wrong, 114) \
- x(freespace_key_wrong, 115) \
- x(freespace_hole_missing, 116) \
- x(bucket_gens_val_size_bad, 117) \
- x(bucket_gens_key_wrong, 118) \
- x(bucket_gens_hole_wrong, 119) \
- x(bucket_gens_to_invalid_dev, 120) \
- x(bucket_gens_to_invalid_buckets, 121) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
- x(need_discard_freespace_key_bad, 124) \
- x(backpointer_pos_wrong, 125) \
- x(backpointer_to_missing_device, 126) \
- x(backpointer_to_missing_alloc, 127) \
- x(backpointer_to_missing_ptr, 128) \
- x(lru_entry_at_time_0, 129) \
- x(lru_entry_to_invalid_bucket, 130) \
- x(lru_entry_bad, 131) \
- x(btree_ptr_val_too_big, 132) \
- x(btree_ptr_v2_val_too_big, 133) \
- x(btree_ptr_has_non_ptr, 134) \
- x(extent_ptrs_invalid_entry, 135) \
- x(extent_ptrs_no_ptrs, 136) \
- x(extent_ptrs_too_many_ptrs, 137) \
- x(extent_ptrs_redundant_crc, 138) \
- x(extent_ptrs_redundant_stripe, 139) \
- x(extent_ptrs_unwritten, 140) \
- x(extent_ptrs_written_and_unwritten, 141) \
- x(ptr_to_invalid_device, 142) \
- x(ptr_to_duplicate_device, 143) \
- x(ptr_after_last_bucket, 144) \
- x(ptr_before_first_bucket, 145) \
- x(ptr_spans_multiple_buckets, 146) \
- x(ptr_to_missing_backpointer, 147) \
- x(ptr_to_missing_alloc_key, 148) \
- x(ptr_to_missing_replicas_entry, 149) \
- x(ptr_to_missing_stripe, 150) \
- x(ptr_to_incorrect_stripe, 151) \
- x(ptr_gen_newer_than_bucket_gen, 152) \
- x(ptr_too_stale, 153) \
- x(stale_dirty_ptr, 154) \
- x(ptr_bucket_data_type_mismatch, 155) \
- x(ptr_cached_and_erasure_coded, 156) \
- x(ptr_crc_uncompressed_size_too_small, 157) \
- x(ptr_crc_csum_type_unknown, 158) \
- x(ptr_crc_compression_type_unknown, 159) \
- x(ptr_crc_redundant, 160) \
- x(ptr_crc_uncompressed_size_too_big, 161) \
- x(ptr_crc_nonce_mismatch, 162) \
- x(ptr_stripe_redundant, 163) \
- x(reservation_key_nr_replicas_invalid, 164) \
- x(reflink_v_refcount_wrong, 165) \
- x(reflink_p_to_missing_reflink_v, 166) \
- x(stripe_pos_bad, 167) \
- x(stripe_val_size_bad, 168) \
- x(stripe_sector_count_wrong, 169) \
- x(snapshot_tree_pos_bad, 170) \
- x(snapshot_tree_to_missing_snapshot, 171) \
- x(snapshot_tree_to_missing_subvol, 172) \
- x(snapshot_tree_to_wrong_subvol, 173) \
- x(snapshot_tree_to_snapshot_subvol, 174) \
- x(snapshot_pos_bad, 175) \
- x(snapshot_parent_bad, 176) \
- x(snapshot_children_not_normalized, 177) \
- x(snapshot_child_duplicate, 178) \
- x(snapshot_child_bad, 179) \
- x(snapshot_skiplist_not_normalized, 180) \
- x(snapshot_skiplist_bad, 181) \
- x(snapshot_should_not_have_subvol, 182) \
- x(snapshot_to_bad_snapshot_tree, 183) \
- x(snapshot_bad_depth, 184) \
- x(snapshot_bad_skiplist, 185) \
- x(subvol_pos_bad, 186) \
- x(subvol_not_master_and_not_snapshot, 187) \
- x(subvol_to_missing_root, 188) \
- x(subvol_root_wrong_bi_subvol, 189) \
- x(bkey_in_missing_snapshot, 190) \
- x(inode_pos_inode_nonzero, 191) \
- x(inode_pos_blockdev_range, 192) \
- x(inode_unpack_error, 193) \
- x(inode_str_hash_invalid, 194) \
- x(inode_v3_fields_start_bad, 195) \
- x(inode_snapshot_mismatch, 196) \
- x(inode_unlinked_but_clean, 197) \
- x(inode_unlinked_but_nlink_nonzero, 198) \
- x(inode_checksum_type_invalid, 199) \
- x(inode_compression_type_invalid, 200) \
- x(inode_subvol_root_but_not_dir, 201) \
- x(inode_i_size_dirty_but_clean, 202) \
- x(inode_i_sectors_dirty_but_clean, 203) \
- x(inode_i_sectors_wrong, 204) \
- x(inode_dir_wrong_nlink, 205) \
- x(inode_dir_multiple_links, 206) \
- x(inode_multiple_links_but_nlink_0, 207) \
- x(inode_wrong_backpointer, 208) \
- x(inode_wrong_nlink, 209) \
- x(inode_unreachable, 210) \
- x(deleted_inode_but_clean, 211) \
- x(deleted_inode_missing, 212) \
- x(deleted_inode_is_dir, 213) \
- x(deleted_inode_not_unlinked, 214) \
- x(extent_overlapping, 215) \
- x(extent_in_missing_inode, 216) \
- x(extent_in_non_reg_inode, 217) \
- x(extent_past_end_of_inode, 218) \
- x(dirent_empty_name, 219) \
- x(dirent_val_too_big, 220) \
- x(dirent_name_too_long, 221) \
- x(dirent_name_embedded_nul, 222) \
- x(dirent_name_dot_or_dotdot, 223) \
- x(dirent_name_has_slash, 224) \
- x(dirent_d_type_wrong, 225) \
- x(dirent_d_parent_subvol_wrong, 226) \
- x(dirent_in_missing_dir_inode, 227) \
- x(dirent_in_non_dir_inode, 228) \
- x(dirent_to_missing_inode, 229) \
- x(dirent_to_missing_subvol, 230) \
- x(dirent_to_itself, 231) \
- x(quota_type_invalid, 232) \
- x(xattr_val_size_too_small, 233) \
- x(xattr_val_size_too_big, 234) \
- x(xattr_invalid_type, 235) \
- x(xattr_name_invalid_chars, 236) \
- x(xattr_in_missing_inode, 237) \
- x(root_subvol_missing, 238) \
- x(root_dir_missing, 239) \
- x(root_inode_not_dir, 240) \
- x(dir_loop, 241) \
- x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243) \
- x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245)
-
-enum bch_sb_error_id {
-#define x(t, n) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
- BCH_SB_ERR_MAX
-};
-
struct bch_sb_error_entry_cpu {
u64 id:16,
nr:48;
@@ -268,4 +13,3 @@ struct bch_sb_error_entry_cpu {
typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
-
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..116131f95815 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,12 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
+#include "error.h"
#include "opts.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
+void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+{
+ if (dev != BCH_SB_MEMBER_INVALID)
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
+}
+
#define x(t, n, ...) [n] = #t,
static const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
@@ -123,9 +136,9 @@ static int validate_member(struct printbuf *err,
struct bch_sb *sb,
int i)
{
- if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
return -BCH_ERR_invalid_sb_members;
}
@@ -150,6 +163,11 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
+ prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -163,18 +181,14 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
return;
- prt_printf(out, "Device:");
- prt_tab(out);
- prt_printf(out, "%u", i);
- prt_newline(out);
+ prt_printf(out, "Device:\t%u\n", i);
printbuf_indent_add(out, 2);
- prt_printf(out, "Label:");
- prt_tab(out);
+ prt_printf(out, "Label:\t");
if (BCH_MEMBER_GROUP(&m)) {
unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
@@ -188,103 +202,76 @@ static void member_to_text(struct printbuf *out,
}
prt_newline(out);
- prt_printf(out, "UUID:");
- prt_tab(out);
+ prt_printf(out, "UUID:\t");
pr_uuid(out, m.uuid.b);
prt_newline(out);
- prt_printf(out, "Size:");
- prt_tab(out);
+ prt_printf(out, "Size:\t");
prt_units_u64(out, device_size << 9);
prt_newline(out);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
- for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
- prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
- prt_tab(out);
- prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++)
+ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
- prt_printf(out, "Bucket size:");
- prt_tab(out);
+ prt_printf(out, "Bucket size:\t");
prt_units_u64(out, bucket_size << 9);
prt_newline(out);
- prt_printf(out, "First bucket:");
- prt_tab(out);
- prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
- prt_newline(out);
-
- prt_printf(out, "Buckets:");
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
- prt_newline(out);
+ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
+ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
- prt_printf(out, "Last mount:");
- prt_tab(out);
+ prt_printf(out, "Last mount:\t");
if (m.last_mount)
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
- prt_printf(out, "Last superblock write:");
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.seq));
- prt_newline(out);
+ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
- prt_printf(out, "State:");
- prt_tab(out);
- prt_printf(out, "%s",
+ prt_printf(out, "State:\t%s\n",
BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
? bch2_member_states[BCH_MEMBER_STATE(&m)]
: "unknown");
- prt_newline(out);
- prt_printf(out, "Data allowed:");
- prt_tab(out);
+ prt_printf(out, "Data allowed:\t");
if (BCH_MEMBER_DATA_ALLOWED(&m))
prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_printf(out, "Has data:");
- prt_tab(out);
+ prt_printf(out, "Has data:\t");
if (data_have)
prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_str(out, "Durability:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_printf(out, "Btree allocated bitmap blocksize:\t");
+ if (m.btree_bitmap_shift < 64)
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ else
+ prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
prt_newline(out);
- prt_printf(out, "Discard:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Btree allocated bitmap:\t");
+ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
prt_newline(out);
- prt_printf(out, "Freespace initialized:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_newline(out);
+ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+
+ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
printbuf_indent_sub(out, 2);
}
-static int bch2_sb_members_v1_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
@@ -332,9 +319,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
-static int bch2_sb_members_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
@@ -389,12 +375,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
printbuf_indent_sub(out, 2);
prt_str(out, "IO errors since ");
@@ -403,12 +385,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
+ atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
printbuf_indent_sub(out, 2);
}
@@ -426,3 +405,128 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bool ret = true;
+ rcu_read_lock();
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
+ BUG_ON(end > 64ULL << m->btree_bitmap_shift);
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
+ continue;
+
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+ }
+}
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a94183271..762083b564ee 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
+#include "bkey_types.h"
extern char * const bch2_member_error_strs[];
@@ -28,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca)
ca->mi.state != BCH_MEMBER_STATE_failed;
}
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return true;
-
- percpu_ref_put(&ca->io_ref);
- return false;
-}
-
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
{
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
@@ -104,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
for (struct bch_dev *_ca = NULL; \
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+static inline void bch2_dev_get(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
+#else
+ percpu_ref_get(&ca->ref);
+#endif
+}
+
+static inline void __bch2_dev_put(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ long r = atomic_long_dec_return(&ca->ref);
+ if (r < (long) !ca->dying)
+ panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
+ ca->last_put = _THIS_IP_;
+ if (!r)
+ complete(&ca->ref_completion);
+#else
+ percpu_ref_put(&ca->ref);
+#endif
+}
+
+static inline void bch2_dev_put(struct bch_dev *ca)
{
if (ca)
- percpu_ref_put(&ca->ref);
+ __bch2_dev_put(ca);
+}
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+{
rcu_read_lock();
+ bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
- percpu_ref_get(&ca->ref);
+ bch2_dev_get(ca);
rcu_read_unlock();
return ca;
@@ -131,10 +146,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
unsigned state_mask)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->io_ref);
- rcu_read_lock();
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
@@ -157,26 +172,121 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
#define for_each_readable_member(c, ca) \
__for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
+{
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
+
+static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
+{
+ return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
+}
+
+static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ EBUG_ON(!bch2_dev_exists(c, dev));
- return rcu_dereference_check(c->devs[idx], 1);
+ return rcu_dereference_check(c->devs[dev], 1);
}
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ EBUG_ON(!bch2_dev_exists(c, dev));
- return rcu_dereference_protected(c->devs[idx],
+ return rcu_dereference_protected(c->devs[dev],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
+static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
+{
+ return c && dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[dev])
+ : NULL;
+}
+
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (ca)
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
+ if (ca && !bucket_valid(ca, bucket.offset)) {
+ bch2_dev_put(ca);
+ ca = NULL;
+ }
+ return ca;
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
+
+static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
+ if (!ca)
+ bch2_dev_bucket_missing(c, bucket);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget_noerror(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (ca &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ return ca;
+
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ return NULL;
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@@ -191,24 +301,28 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-static inline bool bch2_member_exists(struct bch_member *m)
+static inline bool bch2_member_alive(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
}
-static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
{
if (dev < sb->nr_devices) {
struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_exists(&m);
+ return bch2_member_alive(&m);
}
return false;
}
+unsigned bch2_sb_nr_devices(const struct bch_sb *);
+
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
+ .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
+ le16_to_cpu(mi->first_bucket),
.first_bucket = le16_to_cpu(mi->first_bucket),
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
@@ -219,7 +333,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_exists(mi),
+ .valid = bch2_member_alive(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
}
@@ -228,4 +344,24 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+ u64 end = start + sectors;
+
+ if (end > 64ULL << ca->mi.btree_bitmap_shift)
+ return false;
+
+ for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+ (u64) bit << ca->mi.btree_bitmap_shift < end;
+ bit++)
+ if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+ return false;
+ return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
+int bch2_sb_member_alloc(struct bch_fs *);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
new file mode 100644
index 000000000000..2adf1221a440
--- /dev/null
+++ b/fs/bcachefs/sb-members_format.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
+#define _BCACHEFS_SB_MEMBERS_FORMAT_H
+
+/*
+ * We refer to members with bitmasks in various places - but we need to get rid
+ * of this limit:
+ */
+#define BCH_SB_MEMBERS_MAX 64
+
+/*
+ * Sentinal value - indicates a device that does not exist
+ */
+#define BCH_SB_MEMBER_INVALID 255
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+ __le64 seq;
+ __le64 btree_allocated_bitmap;
+ /*
+ * On recovery from a clean shutdown we don't normally read the journal,
+ * but we still want to resume writing from where we left off so we
+ * don't overwrite more than is necessary, for list journal debugging:
+ */
+ __le32 last_journal_bucket;
+ __le32 last_journal_bucket_offset;
+};
+
+/*
+ * btree_allocated_bitmap can represent sector addresses of a u64: it itself has
+ * 64 elements, so 64 - ilog2(64)
+ */
+#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
+
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocation are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; //Members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; //size of single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
new file mode 100644
index 000000000000..c0eda888fe39
--- /dev/null
+++ b/fs/bcachefs/sb-members_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
+#define _BCACHEFS_SB_MEMBERS_TYPES_H
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u64 nbuckets_minus_first;
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
index c1860d8163fb..c4b3d8d3f414 100644
--- a/fs/bcachefs/seqmutex.h
+++ b/fs/bcachefs/seqmutex.h
@@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock)
static inline void seqmutex_lock(struct seqmutex *lock)
{
mutex_lock(&lock->lock);
-}
-
-static inline void seqmutex_unlock(struct seqmutex *lock)
-{
lock->seq++;
- mutex_unlock(&lock->lock);
}
-static inline u32 seqmutex_seq(struct seqmutex *lock)
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
{
- return lock->seq;
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
}
static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
index dc1a27cc31cd..a1cc44e66c7e 100644
--- a/fs/bcachefs/siphash.c
+++ b/fs/bcachefs/siphash.c
@@ -45,7 +45,7 @@
*/
#include <asm/byteorder.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/bitops.h>
#include <linux/string.h>
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 3a494c5d1247..7c403427fbdb 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
ret = -1 - SIX_LOCK_write;
}
} else if (type == SIX_LOCK_write && lock->readers) {
- if (try) {
+ if (try)
atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
+ /*
+ * Make sure atomic_add happens before pcpu_read_count and
+ * six_set_bitmask in slow path happens before pcpu_read_count.
+ *
+ * Paired with the smp_mb() in read lock fast path (per-cpu mode)
+ * and the one before atomic_read in read unlock path.
+ */
+ smp_mb();
ret = !pcpu_read_count(lock);
if (try && !ret) {
@@ -335,7 +341,7 @@ static inline bool six_owner_running(struct six_lock *lock)
*/
rcu_read_lock();
struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
rcu_read_unlock();
return ret;
@@ -485,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
list_del(&wait->list);
raw_spin_unlock(&lock->wait_lock);
- if (unlikely(acquired))
+ if (unlikely(acquired)) {
do_six_unlock_type(lock, type);
+ } else if (type == SIX_LOCK_write) {
+ six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
+ }
break;
}
@@ -495,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
__set_current_state(TASK_RUNNING);
out:
- if (ret && type == SIX_LOCK_write) {
- six_clear_bitmask(lock, SIX_LOCK_HELD_write);
- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
- }
trace_contention_end(lock, 0);
return ret;
@@ -610,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
if (type != SIX_LOCK_write)
six_release(&lock->dep_map, ip);
- else
- lock->seq++;
if (type == SIX_LOCK_intent &&
lock->intent_lock_recurse) {
@@ -619,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
return;
}
+ if (type == SIX_LOCK_write &&
+ lock->write_lock_recurse) {
+ --lock->write_lock_recurse;
+ return;
+ }
+
+ if (type == SIX_LOCK_write)
+ lock->seq++;
+
do_six_unlock_type(lock, type);
}
EXPORT_SYMBOL_GPL(six_unlock_ip);
@@ -729,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
atomic_add(l[type].lock_val, &lock->state);
}
break;
+ case SIX_LOCK_write:
+ lock->write_lock_recurse++;
+ fallthrough;
case SIX_LOCK_intent:
EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
lock->intent_lock_recurse++;
break;
- case SIX_LOCK_write:
- BUG();
- break;
}
}
EXPORT_SYMBOL_GPL(six_lock_increment);
@@ -837,7 +850,8 @@ void six_lock_exit(struct six_lock *lock)
EXPORT_SYMBOL_GPL(six_lock_exit);
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags)
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp)
{
atomic_set(&lock->state, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -860,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
* failure if they wish by checking lock->readers, but generally
* will not want to treat it as an error.
*/
- lock->readers = alloc_percpu(unsigned);
+ lock->readers = alloc_percpu_gfp(unsigned, gfp);
}
#endif
}
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index 68d46fd7f391..59b851cf8bac 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -137,6 +137,7 @@ struct six_lock {
atomic_t state;
u32 seq;
unsigned intent_lock_recurse;
+ unsigned write_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
raw_spinlock_t wait_lock;
@@ -163,18 +164,19 @@ enum six_lock_init_flags {
};
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags);
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp);
/**
* six_lock_init - initialize a six lock
* @lock: lock to initialize
* @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
*/
-#define six_lock_init(lock, flags) \
+#define six_lock_init(lock, flags, gfp) \
do { \
static struct lock_class_key __key; \
\
- __six_lock_init((lock), #lock, &__key, flags); \
+ __six_lock_init((lock), #lock, &__key, flags, gfp); \
} while (0)
/**
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04d5521..c54091a28909 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -2,12 +2,14 @@
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -30,15 +32,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(t.v->root_snapshot));
}
-int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)), c, err,
- snapshot_tree_pos_bad,
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_tree_pos_bad,
"bad pos");
fsck_err:
return ret;
@@ -48,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot_tree *s)
{
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+ BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
ret = -BCH_ERR_ENOENT_snapshot_tree;
@@ -91,23 +92,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
/* Snapshot nodes: */
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
+ return id == ancestor;
+}
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
+ bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
rcu_read_unlock();
- return id == ancestor;
+ return ret;
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
if (s->skip[2] <= ancestor)
return s->skip[2];
@@ -118,27 +125,36 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
bool ret;
- EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
+ struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
+ ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+ goto out;
+ }
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
-
- EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
+out:
rcu_read_unlock();
return ret;
@@ -147,36 +163,42 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
- size_t new_size;
struct snapshot_table *new, *old;
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
+ new->nr = new_size;
+
old = rcu_dereference_protected(c->snapshots, true);
if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- kvfree_rcu_mightsleep(old);
+ kvfree_rcu(old, rcu);
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
lockdep_assert_held(&c->snapshot_table_lock);
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
return __snapshot_t_mut(c, id);
}
@@ -203,55 +225,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(s.v->skip[2]));
}
-int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_snapshot s;
u32 i, id;
int ret = 0;
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)), c, err,
- snapshot_pos_bad,
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_pos_bad,
"bad pos");
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
- bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
- snapshot_parent_bad,
+ bkey_fsck_err_on(id && id <= k.k->p.offset,
+ c, snapshot_parent_bad,
"bad parent node (%u <= %llu)",
id, k.k->p.offset);
- bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
- snapshot_children_not_normalized,
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
+ c, snapshot_children_not_normalized,
"children not normalized");
- bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
- snapshot_child_duplicate,
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
+ c, snapshot_child_duplicate,
"duplicate child nodes");
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
- bkey_fsck_err_on(id >= k.k->p.offset, c, err,
- snapshot_child_bad,
+ bkey_fsck_err_on(id >= k.k->p.offset,
+ c, snapshot_child_bad,
"bad child node (%u >= %llu)",
id, k.k->p.offset);
}
if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
- snapshot_skiplist_not_normalized,
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
+ c, snapshot_skiplist_not_normalized,
"skiplist not normalized");
for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
id = le32_to_cpu(s.v->skip[i]);
- bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
- snapshot_skiplist_bad,
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
+ c, snapshot_skiplist_bad,
"bad skiplist node %u", id);
}
}
@@ -259,27 +280,10 @@ fsck_err:
return ret;
}
-static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- struct snapshot_t *t = snapshot_t_mut(c, id);
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-}
-
-static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- mutex_lock(&c->snapshot_table_lock);
- __set_is_ancestor_bitmap(c, id);
- mutex_unlock(&c->snapshot_table_lock);
-}
-
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct snapshot_t *t;
@@ -297,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+ t->live = true;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@@ -315,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t->skip[2] = 0;
}
- __set_is_ancestor_bitmap(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
if (BCH_SNAPSHOT_DELETED(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
@@ -333,7 +342,7 @@ err:
int bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
@@ -342,71 +351,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot, s);
-}
-
-static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
-{
- struct bch_snapshot v;
- int ret;
-
- if (!id)
- return 0;
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot node %u not found", id);
- if (ret)
- return ret;
-
- return !BCH_SNAPSHOT_DELETED(&v);
-}
-
-/*
- * If @k is a snapshot with just one live child, it's part of a linear chain,
- * which we consider to be an equivalence class: and then after snapshot
- * deletion cleanup, there should only be a single key at a given position in
- * this equivalence class.
- *
- * This sets the equivalence class of @k to be the child's equivalence class, if
- * it's part of such a linear chain: this correctly sets equivalence classes on
- * startup if we run leaf to root (i.e. in natural key order).
- */
-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- unsigned i, nr_live = 0, live_idx = 0;
- struct bkey_s_c_snapshot snap;
- u32 id = k.k->p.offset, child[2];
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- snap = bkey_s_c_to_snapshot(k);
-
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
-
- for (i = 0; i < 2; i++) {
- int ret = bch2_snapshot_live(trans, child[i]);
-
- if (ret < 0)
- return ret;
-
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
-
- mutex_lock(&c->snapshot_table_lock);
-
- snapshot_t_mut(c, id)->equiv = nr_live == 1
- ? snapshot_t_mut(c, child[live_idx])->equiv
- : id;
-
- mutex_unlock(&c->snapshot_table_lock);
-
- return 0;
+ BTREE_ITER_with_updates, snapshot, s);
}
/* fsck: */
@@ -449,6 +394,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 id = snapshot_root;
u32 subvol = 0, s;
+ rcu_read_lock();
while (id) {
s = snapshot_t(c, id)->subvol;
@@ -457,6 +403,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
id = bch2_snapshot_tree_next(c, id);
}
+ rcu_read_unlock();
return subvol;
}
@@ -484,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
break;
}
}
-
bch2_trans_iter_exit(trans, &iter);
if (!ret && !found) {
@@ -514,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
struct bch_snapshot s;
struct bch_subvolume subvol;
struct printbuf buf = PRINTBUF;
+ struct btree_iter snapshot_iter = {};
u32 root_id;
int ret;
@@ -523,39 +470,52 @@ static int check_snapshot_tree(struct btree_trans *trans,
st = bkey_s_c_to_snapshot_tree(k);
root_id = le32_to_cpu(st.v->root_snapshot);
- ret = bch2_snapshot_lookup(trans, root_id, &s);
+ struct bkey_s_c_snapshot snapshot_k =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
+ POS(0, root_id), 0, snapshot);
+ ret = bkey_err(snapshot_k);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
+ if (!ret)
+ bkey_val_copy(&s, snapshot_k);
+
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
- c, snapshot_tree_to_missing_snapshot,
+ trans, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, st.s_c),
+ prt_newline(&buf),
+ ret
+ ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
+ : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
+ buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
goto err;
}
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
- false, 0, &subvol);
+ if (!st.v->master_subvol)
+ goto out;
+
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret,
- c, snapshot_tree_to_missing_subvol,
+ trans, snapshot_tree_to_missing_subvol,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
- c, snapshot_tree_to_wrong_subvol,
+ trans, snapshot_tree_to_wrong_subvol,
"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
- c, snapshot_tree_to_snapshot_subvol,
+ trans, snapshot_tree_to_snapshot_subvol,
"snapshot tree points to snapshot subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@@ -563,6 +523,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -574,8 +541,10 @@ static int check_snapshot_tree(struct btree_trans *trans,
u->v.master_subvol = cpu_to_le32(subvol_id);
st = snapshot_tree_i_to_s_c(u);
}
+out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &snapshot_iter);
printbuf_exit(&buf);
return ret;
}
@@ -592,7 +561,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -669,7 +638,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
root = bch2_bkey_get_iter_typed(trans, &root_iter,
BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_WITH_UPDATES, snapshot);
+ BTREE_ITER_with_updates, snapshot);
ret = bkey_err(root);
if (ret)
goto err;
@@ -720,7 +689,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -766,12 +734,12 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ ret = bch2_subvolume_get(trans, id, false, &subvol);
if (bch2_err_matches(ret, ENOENT))
bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -786,7 +754,7 @@ static int check_snapshot(struct btree_trans *trans,
}
} else {
if (fsck_err_on(s.subvol,
- c, snapshot_should_not_have_subvol,
+ trans, snapshot_should_not_have_subvol,
"snapshot should not point to subvol:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -803,7 +771,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+ if (fsck_err_on(!ret,
+ trans, snapshot_to_bad_snapshot_tree,
"snapshot points to missing/incorrect tree:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
@@ -815,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);
if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
- c, snapshot_bad_depth,
+ trans, snapshot_bad_depth,
"snapshot with incorrect depth field, should be %u:\n %s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -831,7 +800,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+ if (fsck_err_on(!ret,
+ trans, snapshot_bad_skiplist,
"snapshot with bad skiplist field:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -861,26 +831,221 @@ int bch2_check_snapshots(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
}
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_exists(c, id))
+ return 0;
+
+ /* Do we need to reconstruct the snapshot_tree entry as well? */
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+ u32 tree_id = 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ tree_id = k.k->p.offset;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ if (!tree_id) {
+ ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+ }
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
+ SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_exists(c, *id),
+ trans, snapshot_node_missing,
+ "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
+ trans, bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
struct btree_iter iter;
- struct bkey_i_snapshot *s;
- int ret = 0;
-
- s = bch2_bkey_get_mut_typed(trans, &iter,
+ struct bkey_i_snapshot *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_snapshots, POS(0, id),
0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
+ int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
trans->c, "missing snapshot %u", id);
@@ -917,7 +1082,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
int ret = 0;
s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_INTENT, snapshot);
+ BTREE_ITER_intent, snapshot);
ret = bkey_err(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
@@ -1026,7 +1191,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_INTENT);
+ POS_MIN, BTREE_ITER_intent);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
@@ -1068,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
goto err;
new_snapids[i] = iter.pos.offset;
-
- mutex_lock(&c->snapshot_table_lock);
- snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
- mutex_unlock(&c->snapshot_table_lock);
}
err:
bch2_trans_iter_exit(trans, &iter);
@@ -1177,126 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-static int snapshot_delete_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *deleted,
- snapshot_id_list *equiv_seen,
- struct bpos *last_pos)
+struct snapshot_interior_delete {
+ u32 id;
+ u32 live_child;
+};
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
+
+static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ darray_for_each(*l, i)
+ if (i->id == id)
+ return i->live_child;
+ return 0;
+}
- if (!bkey_eq(k.k->p, *last_pos))
- equiv_seen->nr = 0;
- *last_pos = k.k->p;
+static unsigned __live_child(struct snapshot_table *t, u32 id,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
+{
+ struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(equiv_seen, equiv)) {
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- } else {
- return snapshot_list_add(c, equiv_seen, equiv);
+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
+ if (s->children[i] &&
+ !snapshot_list_has_id(delete_leaves, s->children[i]) &&
+ !interior_delete_has_id(delete_interior, s->children[i]))
+ return s->children[i];
+
+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
+ u32 live_child = s->children[i]
+ ? __live_child(t, s->children[i], delete_leaves, delete_interior)
+ : 0;
+ if (live_child)
+ return live_child;
}
+
+ return 0;
}
-static int move_key_to_correct_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+static unsigned live_child(struct bch_fs *c, u32 id,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
{
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ rcu_read_lock();
+ u32 ret = __live_child(rcu_dereference(c->snapshots), id,
+ delete_leaves, delete_interior);
+ rcu_read_unlock();
+ return ret;
+}
- /*
- * When we have a linear chain of snapshot nodes, we consider
- * those to form an equivalence class: we're going to collapse
- * them all down to a single node, and keep the leaf-most node -
- * which has the same id as the equivalence class id.
- *
- * If there are multiple keys in different snapshots at the same
- * position, we're only going to keep the one in the newest
- * snapshot - the rest have been overwritten and are redundant,
- * and for the key we're going to keep we need to move it to the
- * equivalance class ID if it's not there already.
- */
- if (equiv != k.k->p.snapshot) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- struct btree_iter new_iter;
- int ret;
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
+{
+ if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(new);
+ u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
+ if (live_child) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
- new->k.p.snapshot = equiv;
+ new->k.p.snapshot = live_child;
- bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
-
- ret = bch2_btree_iter_traverse(&new_iter) ?:
- bch2_trans_update(trans, &new_iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- bch2_trans_iter_exit(trans, &new_iter);
+ struct btree_iter dst_iter;
+ struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
+ iter->btree_id, new->k.p,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
+ ret = bkey_err(dst_k);
if (ret)
return ret;
+
+ ret = (bkey_deleted(dst_k.k)
+ ? bch2_trans_update(trans, &dst_iter, new,
+ BTREE_UPDATE_internal_snapshot_node)
+ : 0) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
}
return 0;
}
-static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it
+ * as deleted.
+ */
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
{
- struct bkey_s_c_snapshot snap;
- u32 children[2];
- int ret;
-
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- BCH_SNAPSHOT_SUBVOL(snap.v))
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+ unsigned live_children = 0;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
- children[0] = le32_to_cpu(snap.v->children[0]);
- children[1] = le32_to_cpu(snap.v->children[1]);
+ for (unsigned i = 0; i < 2; i++) {
+ u32 child = le32_to_cpu(s.v->children[i]);
- ret = bch2_snapshot_live(trans, children[0]) ?:
- bch2_snapshot_live(trans, children[1]);
- if (ret < 0)
- return ret;
- return !ret;
-}
+ live_children += child &&
+ !snapshot_list_has_id(delete_leaves, child);
+ }
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
- int ret = bch2_snapshot_needs_delete(trans, k);
+ if (live_children == 0) {
+ return snapshot_list_add(c, delete_leaves, s.k->p.offset);
+ } else if (live_children == 1) {
+ struct snapshot_interior_delete d = {
+ .id = s.k->p.offset,
+ .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
+ };
+
+ if (!d.live_child) {
+ bch_err(c, "error finding live child of snapshot %u", d.id);
+ return -EINVAL;
+ }
- return ret <= 0
- ? ret
- : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+ return darray_push(delete_interior, d);
+ } else {
+ return 0;
+ }
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- snapshot_id_list *skip)
+ interior_delete_list *skip)
{
rcu_read_lock();
- while (snapshot_list_has_id(skip, id))
+ while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
while (n--) {
do {
id = __bch2_snapshot_parent(c, id);
- } while (snapshot_list_has_id(skip, id));
+ } while (interior_delete_has_id(skip, id));
}
rcu_read_unlock();
@@ -1305,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct btree_iter *iter, struct bkey_s_c k,
- snapshot_id_list *deleted)
+ interior_delete_list *deleted)
{
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
@@ -1315,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- if (snapshot_list_has_id(deleted, k.k->p.offset))
+ if (interior_delete_has_id(deleted, k.k->p.offset))
return 0;
s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
@@ -1324,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return ret;
darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
if (!nr_deleted_ancestors)
return 0;
@@ -1342,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
u32 id = le32_to_cpu(s->v.skip[j]);
- if (snapshot_list_has_id(deleted, id)) {
+ if (interior_delete_has_id(deleted, id)) {
id = bch2_snapshot_nth_parent_skip(c,
parent,
depth > 1
@@ -1361,108 +1549,74 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
- struct btree_trans *trans;
- snapshot_id_list deleted = { 0 };
- snapshot_id_list deleted_interior = { 0 };
- u32 id;
- int ret = 0;
-
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_started, &c->flags)) {
- ret = bch2_fs_read_write_early(c);
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
- if (ret)
- return ret;
- }
-
- trans = bch2_trans_get(c);
+ struct btree_trans *trans = bch2_trans_get(c);
+ snapshot_id_list delete_leaves = {};
+ interior_delete_list delete_interior = {};
+ int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- NULL, NULL, 0,
- bch2_delete_redundant_snapshot(trans, k));
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
+ check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_snapshot_set_equiv(trans, k));
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
- if (ret)
+ if (!delete_leaves.nr && !delete_interior.nr)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "deleting leaves");
+ darray_for_each(delete_leaves, i)
+ prt_printf(&buf, " %u", *i);
- BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
- ? snapshot_list_add(c, &deleted, k.k->p.offset)
- : 0;
- }));
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
+ prt_printf(&buf, " interior");
+ darray_for_each(delete_interior, i)
+ prt_printf(&buf, " %u->%u", i->id, i->live_child);
- for (id = 0; id < BTREE_ID_NR; id++) {
- struct bpos last_pos = POS_MIN;
- snapshot_id_list equiv_seen = { 0 };
- struct disk_reservation res = { 0 };
+ ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
+ printbuf_exit(&buf);
+ if (ret)
+ goto err;
+ }
- if (!btree_type_has_snapshots(id))
- continue;
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ struct disk_reservation res = { 0 };
- /*
- * deleted inodes btree is maintained by a trigger on the inodes
- * btree - no work for us to do here, and it's not safe to scan
- * it because we'll see out of date keys due to the btree write
- * buffer:
- */
- if (id == BTREE_ID_deleted_inodes)
+ if (!btree_type_has_snapshots(btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
- for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ btree, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
- move_key_to_correct_snapshot(trans, &iter, k));
+ delete_dead_snapshots_process_key(trans, &iter, k,
+ &delete_leaves,
+ &delete_interior));
bch2_disk_reservation_put(c, &res);
- darray_exit(&equiv_seen);
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
}
- bch2_trans_unlock(trans);
- down_write(&c->snapshot_create_lock);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- u32 snapshot = k.k->p.offset;
- u32 equiv = bch2_snapshot_equiv(c, snapshot);
-
- equiv != snapshot
- ? snapshot_list_add(c, &deleted_interior, snapshot)
- : 0;
- }));
-
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err_create_lock;
+ darray_for_each(delete_leaves, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
+ goto err;
+ }
/*
* Fixing children of deleted snapshots can't be done completely
@@ -1470,34 +1624,26 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
* nodes some depth fields will be off:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
if (ret)
- goto err_create_lock;
-
- darray_for_each(deleted, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err_create_lock;
- }
+ goto err;
- darray_for_each(deleted_interior, i) {
+ darray_for_each(delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch2_snapshot_node_delete(trans, i->id));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting snapshot %u", i->id);
if (ret)
- goto err_create_lock;
+ goto err;
}
-err_create_lock:
- up_write(&c->snapshot_create_lock);
err:
- darray_exit(&deleted_interior);
- darray_exit(&deleted);
+ darray_exit(&delete_interior);
+ darray_exit(&delete_leaves);
bch2_trans_put(trans);
- bch_err_fn(c, ret);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
@@ -1505,14 +1651,20 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
+
bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
- !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+ return;
+
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
@@ -1525,18 +1677,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, id, pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while (1) {
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (!k.k)
- break;
-
+ for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots,
+ k, ret) {
if (!bkey_eq(pos, k.k->p))
break;
@@ -1550,134 +1694,51 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return ret;
}
-static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
-
- return s->children[1] ?: s->children[0];
-}
-
-static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
-{
- u32 child;
-
- while ((child = bch2_snapshot_smallest_child(c, id)))
- id = child;
- return id;
-}
-
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c interior_k,
- u32 leaf_id, struct bpos *new_min_pos)
-{
- struct btree_iter iter;
- struct bpos pos = interior_k.k->p;
- struct bkey_s_c k;
- struct bkey_i *new;
- int ret;
-
- pos.snapshot = leaf_id;
-
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- /* key already overwritten in this snapshot? */
- if (k.k->p.snapshot != interior_k.k->p.snapshot)
- goto out;
-
- if (bpos_eq(*new_min_pos, POS_MIN)) {
- *new_min_pos = k.k->p;
- new_min_pos->snapshot = leaf_id;
- }
-
- new = bch2_bkey_make_mut_noupdate(trans, interior_k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- new->k.p.snapshot = leaf_id;
- ret = bch2_trans_update(trans, &iter, new, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c k,
- struct bpos *new_min_pos)
+static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
- struct bch_fs *c = trans->c;
- struct bkey_buf sk;
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- *new_min_pos = POS_MIN;
-
- for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
- id < k.k->p.snapshot;
- id++) {
- if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
- !bch2_snapshot_is_leaf(c, id))
- continue;
-again:
- ret = btree_trans_too_many_iters(trans) ?:
- bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_begin(trans);
- goto again;
- }
-
- if (ret)
- break;
- }
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret ?: trans_was_restarted(trans, restart_count);
+ /* If there's one child, it's redundant and keys will be moved to the child */
+ return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot snap;
- int ret = 0;
-
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- snap = bkey_s_c_to_snapshot(k);
+ struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v) ||
- bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
- (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- return 0;
- }
+ interior_snapshot_needs_delete(snap))
+ set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
- return ret;
+ return 0;
}
int bch2_snapshots_read(struct bch_fs *c)
{
+ /*
+ * Initializing the is_ancestor bitmaps requires ancestors to already be
+ * initialized - so mark in reverse:
+ */
int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
+ POS_MAX, 0, k,
__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_snapshot_set_equiv(trans, k) ?:
- bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ bch2_check_snapshot_needs_deletion(trans, k)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc06385..00373cf32e7b 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -2,14 +2,12 @@
#ifndef _BCACHEFS_SNAPSHOT_H
#define _BCACHEFS_SNAPSHOT_H
-enum bkey_invalid_flags;
-
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_tree_invalid, \
+ .key_validate = bch2_snapshot_tree_validate, \
.val_to_text = bch2_snapshot_tree_to_text, \
.min_val_size = 8, \
})
@@ -19,13 +17,14 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_invalid, \
+ .key_validate = bch2_snapshot_validate, \
.val_to_text = bch2_snapshot_to_text, \
.trigger = bch2_mark_snapshot, \
.min_val_size = 24, \
@@ -33,7 +32,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{
- return &t->s[U32_MAX - id];
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
}
static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +47,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = snapshot_t(c, id)->tree;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
rcu_read_unlock();
return id;
@@ -52,7 +56,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
}
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +71,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ u32 parent = s->parent;
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
id, snapshot_t(c, id)->depth,
parent, snapshot_t(c, parent)->depth);
return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
}
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -114,57 +119,37 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
return id;
}
-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->equiv;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->live : 0;
}
-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = __bch2_snapshot_equiv(c, id);
+ bool ret = __bch2_snapshot_exists(c, id);
rcu_read_unlock();
- return id;
-}
-
-static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
-{
- return id == bch2_snapshot_equiv(c, id);
+ return ret;
}
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- bool ret;
-
rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
+ const struct snapshot_t *s = snapshot_t(c, id);
+ int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
rcu_read_unlock();
return ret;
}
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
}
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -189,12 +174,9 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *t;
- bool ret;
-
rcu_read_lock();
- t = snapshot_t(c, id);
- ret = (t->children[0]|t->children[1]) != 0;
+ const struct snapshot_t *t = snapshot_t(c, id);
+ bool ret = t && (t->children[0]|t->children[1]) != 0;
rcu_read_unlock();
return ret;
@@ -218,15 +200,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +239,8 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
+int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,15 +252,12 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bpos pos)
{
if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
return 0;
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
- struct bkey_s_c, struct bpos *);
-
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
new file mode 100644
index 000000000000..d78451c2a0c6
--- /dev/null
+++ b/fs/bcachefs/str_hash.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fsck.h"
+#include "str_hash.h"
+#include "subvolume.h"
+
+static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
+{
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_subvolume subvol;
+ int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
+ false, &subvol);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+ }
+}
+
+static noinline int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
+
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+
+ if (u64s > U8_MAX)
+ return -EINVAL;
+
+ new->k.u64s = u64s;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+}
+
+static noinline int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
+{
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
+
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
+
+ ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int repair_inode_hash_info(struct btree_trans *trans,
+ struct bch_inode_unpacked *snapshot_root)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != snapshot_root->bi_inum)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ break;
+
+ if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
+ trans, inode_snapshot_mismatch,
+ "inode hash info in different snapshots don't match")) {
+ inode.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
+ ret = __bch2_fsck_write_inode(trans, &inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ break;
+ }
+ }
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * All versions of the same inode in different snapshots must have the same hash
+ * seed/type: verify that the hash info we're using matches the root
+ */
+static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
+ struct bch_hash_info *hash_info)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found;
+ }
+ bch_err(c, "%s(): inum %llu not found", __func__, inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+found:;
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto err;
+
+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
+ if (hash_info->type != hash2.type ||
+ memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
+ ret = repair_inode_hash_info(trans, &inode);
+ if (!ret) {
+ bch_err(c, "inode hash info mismatch with root, but mismatch not found\n"
+ "%u %llx %llx\n"
+ "%u %llx %llx",
+ hash_info->type,
+ hash_info->siphash_key.k0,
+ hash_info->siphash_key.k1,
+ hash2.type,
+ hash2.siphash_key.k0,
+ hash2.siphash_key.k1);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_str_hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ u64 hash = desc->hash_bkey(hash_info, hash_k);
+ if (hash_k.k->p.offset < hash)
+ goto bad_hash;
+
+ for_each_btree_key_norestart(trans, iter, desc->btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_slots, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
+ break;
+
+ if (k.k->type == desc->key_type &&
+ !desc->cmp_bkey(k, hash_k))
+ goto duplicate_entries;
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto bad_hash;
+ }
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+bad_hash:
+ /*
+ * Before doing any repair, check hash_info itself:
+ */
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
+ bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+fsck_err:
+ goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index fcaa5a888744..55a4ac7bf220 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,16 +15,6 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
-typedef unsigned __bitwise bch_str_hash_flags_t;
-
-enum bch_str_hash_flags {
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
-};
-
-#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
-
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -56,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
- .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS),
+ .type = INODE_STR_HASH(bi),
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -159,24 +148,25 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
desc.is_visible(inum, k));
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags, u32 snapshot)
+ enum btree_iter_update_trigger_flags flags,
+ u32 snapshot)
{
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|flags, k, ret) {
+ BTREE_ITER_slots|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return 0;
+ return k;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
@@ -186,20 +176,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
}
bch2_trans_iter_exit(trans, iter);
- return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
+ return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
}
static __always_inline int
@@ -217,10 +210,10 @@ bch2_hash_hole(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
bch2_trans_iter_exit(trans, iter);
@@ -242,7 +235,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -259,25 +252,25 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags,
- int update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
- struct btree_iter iter, slot = { NULL };
+ struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -286,9 +279,8 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
continue;
}
- if (!slot.path &&
- !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
- bch2_trans_copy_iter(&slot, &iter);
+ if (!slot.path && !(flags & STR_HASH_must_replace))
+ bch2_trans_copy_iter(&slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
@@ -298,47 +290,63 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
found:
found = true;
not_found:
-
- if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (found && (flags & STR_HASH_must_create)) {
+ bch2_trans_iter_exit(trans, &slot);
+ return k;
+ } else if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
- ret = -EEXIST;
} else {
if (!found && slot.path)
- swap(iter, slot);
+ swap(*iter, slot);
- insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, update_flags);
+ insert->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, insert, flags);
}
goto out;
}
static __always_inline
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
+ snapshot, insert, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (k.k) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_EEXIST_str_hash_set;
+ }
+
+ return 0;
+}
+
+static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
insert->k.p.inode = inum.inum;
- return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_set_in_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags);
}
static __always_inline
@@ -346,7 +354,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter,
- unsigned update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i *delete;
int ret;
@@ -364,7 +372,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, update_flags);
+ return bch2_trans_update(trans, iter, delete, flags);
}
static __always_inline
@@ -374,10 +382,9 @@ int bch2_hash_delete(struct btree_trans *trans,
subvol_inum inum, const void *key)
{
struct btree_iter iter;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
return ret;
@@ -386,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans,
return ret;
}
+struct snapshots_seen;
+int __bch2_str_hash_check_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c);
+
+static inline int bch2_str_hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ if (hash_k.k->type != desc->key_type)
+ return 0;
+
+ if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
+ return 0;
+
+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+}
+
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 7c67c28d3ef8..b7b96283c316 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -13,13 +13,26 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+ if (k.k->type != KEY_TYPE_subvolume)
+ return POS_MIN;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (!s.v->fs_path_parent)
+ return POS_MIN;
+ return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
+}
+
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_subvolume subvol;
+ struct btree_iter subvol_children_iter = {};
struct bch_snapshot snapshot;
+ struct printbuf buf = PRINTBUF;
unsigned snapid;
int ret = 0;
@@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+ if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+ subvol.v->fs_path_parent,
+ trans, subvol_root_fs_path_parent_nonzero,
+ "root subvolume has nonzero fs_path_parent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = 0;
+ }
+
+ if (subvol.v->fs_path_parent) {
+ struct bpos pos = subvolume_children_pos(k);
+
+ struct bkey_s_c subvol_children_k =
+ bch2_bkey_get_iter(trans, &subvol_children_iter,
+ BTREE_ID_subvolume_children, pos, 0);
+ ret = bkey_err(subvol_children_k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+ trans, subvol_children_not_set,
+ "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+ pos.inode, pos.offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+ if (ret)
+ goto err;
+ }
+ }
+
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+ &inode);
+ if (!ret) {
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ trans, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode.bi_snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto err;
+ }
+ } else if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(trans, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ ret = ret ?: -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+ } else {
+ goto err;
+ }
+
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@@ -57,23 +136,25 @@ static int check_subvol(struct btree_trans *trans,
"%s: snapshot tree %u not found", __func__, snapshot_tree);
if (ret)
- return ret;
+ goto err;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
- c, subvol_not_master_and_not_snapshot,
+ trans, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(s);
if (ret)
- return ret;
+ goto err;
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
}
-
+err:
fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_children_iter);
+ printbuf_exit(&buf);
return ret;
}
@@ -81,24 +162,68 @@ int bch2_check_subvols(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
}
+static int check_subvol_child(struct btree_trans *trans,
+ struct btree_iter *child_iter,
+ struct bkey_s_c child_k)
+{
+ struct bch_subvolume s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+ 0, subvolume, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret ||
+ le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+ trans, subvol_children_bad,
+ "incorrect entry in subvolume_children btree %llu:%llu",
+ child_k.k->p.inode, child_k.k->p.offset)) {
+ ret = bch2_btree_delete_at(trans, child_iter, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_child(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return 0;
+}
+
/* Subvolumes: */
-int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
+ struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
int ret = 0;
bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
- subvol_pos_bad,
+ bkey_gt(k.k->p, SUBVOL_POS_MAX),
+ c, subvol_pos_bad,
"invalid pos");
+
+ bkey_fsck_err_on(!subvol.v->snapshot,
+ c, subvol_snapshot_bad,
+ "invalid snapshot");
+
+ bkey_fsck_err_on(!subvol.v->inode,
+ c, subvol_inode_bad,
+ "invalid inode");
fsck_err:
return ret;
}
@@ -112,18 +237,60 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
- prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+ prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+ prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+ }
+}
+
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+ return !bpos_eq(pos, POS_MIN)
+ ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
+ : 0;
+}
+
+int bch2_subvolume_trigger(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bpos children_pos_old = subvolume_children_pos(old);
+ struct bpos children_pos_new = subvolume_children_pos(new.s_c);
+
+ if (!bpos_eq(children_pos_old, children_pos_new)) {
+ int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
+ subvolume_children_mod(trans, children_pos_new, true);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+ struct btree_iter iter;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bkey_err(k) ?: k.k && k.k->p.inode == subvol
+ ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+ : 0;
}
static __always_inline int
bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
bool inconsistent_if_not_found,
- int iter_flags,
struct bch_subvolume *s)
{
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
- iter_flags, subvolume, s);
+ BTREE_ITER_cached|
+ BTREE_ITER_with_updates, subvolume, s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
inconsistent_if_not_found,
trans->c, "missing subvolume %u", subvol);
@@ -132,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
bool inconsistent_if_not_found,
- int iter_flags,
struct bch_subvolume *s)
{
- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
}
int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
{
struct bch_subvolume s;
- int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s);
+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
if (ret)
return ret;
@@ -152,8 +318,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_subvol_is_ro_trans(trans, subvol));
+ return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
}
int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
@@ -162,11 +327,11 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
struct bch_snapshot snap;
return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
}
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid)
+int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid, bool warn)
{
struct btree_iter iter;
struct bkey_s_c_subvolume subvol;
@@ -174,10 +339,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
subvol = bch2_bkey_get_iter_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ BTREE_ITER_cached|BTREE_ITER_with_updates,
subvolume);
ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+
+ bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
"missing subvolume %u", subvolid);
if (likely(!ret))
@@ -186,6 +352,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
return ret;
}
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid)
+{
+ return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
+}
+
static int bch2_subvolume_reparent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -197,8 +369,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
return 0;
s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@@ -206,7 +378,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (ret)
return ret;
- s->v.parent = cpu_to_le32(new_parent);
+ s->v.creation_parent = cpu_to_le32(new_parent);
return 0;
}
@@ -223,13 +395,12 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
struct bch_subvolume s;
return lockrestart_do(trans,
- bch2_subvolume_get(trans, subvolid_to_delete, true,
- BTREE_ITER_CACHED, &s)) ?:
+ bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.parent)));
+ subvolid_to_delete, le32_to_cpu(s.creation_parent)));
}
/*
@@ -238,26 +409,61 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
*/
static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- u32 snapid;
- int ret = 0;
+ struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ struct bkey_s_c_subvolume subvol =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+ BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
- ret = bkey_err(subvol);
+ int ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
"missing subvolume %u", subvolid);
if (ret)
- return ret;
+ goto err;
- snapid = le32_to_cpu(subvol.v->snapshot);
+ u32 snapid = le32_to_cpu(subvol.v->snapshot);
+
+ struct bkey_s_c_snapshot snapshot =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter,
+ BTREE_ID_snapshots, POS(0, snapid),
+ 0, snapshot);
+ ret = bkey_err(snapshot);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot %u", snapid);
+ if (ret)
+ goto err;
+
+ u32 treeid = le32_to_cpu(snapshot.v->tree);
+
+ struct bkey_s_c_snapshot_tree snapshot_tree =
+ bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, treeid),
+ 0, snapshot_tree);
+ ret = bkey_err(snapshot_tree);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot tree %u", treeid);
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
+ struct bkey_i_snapshot_tree *snapshot_tree_mut =
+ bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
+ &snapshot_tree.s_c,
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
+ if (ret)
+ goto err;
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ snapshot_tree_mut->v.master_subvol = 0;
+ }
+
+ ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
bch2_snapshot_node_set_deleted(trans, snapid);
- bch2_trans_iter_exit(trans, &iter);
+err:
+ bch2_trans_iter_exit(trans, &snapshot_tree_iter);
+ bch2_trans_iter_exit(trans, &snapshot_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
return ret;
}
@@ -346,7 +552,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
n = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(n);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -360,6 +566,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
}
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 parent_subvolid,
u32 src_subvolid,
u32 *new_subvolid,
u32 *new_snapshotid,
@@ -387,7 +594,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(src_subvol);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -416,12 +623,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
if (ret)
goto err;
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.parent = cpu_to_le32(src_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
@@ -434,6 +642,78 @@ err:
return ret;
}
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a6f56f66e27c..910f6196700e 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -5,33 +5,86 @@
#include "darray.h"
#include "subvolume_types.h"
-enum bkey_invalid_flags;
-
int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
-int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
- .key_invalid = bch2_subvolume_invalid, \
+ .key_validate = bch2_subvolume_validate, \
.val_to_text = bch2_subvolume_to_text, \
+ .trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
})
+int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
- bool, int, struct bch_subvolume *);
+ bool, struct bch_subvolume *);
+int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
+ u32 *, bool);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
+static inline struct bkey_s_c
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
+ u32 subvolid, unsigned flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+ return bch2_btree_iter_peek_max_type(iter, end, flags);
+}
+
+#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
+ _end, _subvolid, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
+ _start, _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do); \
+})
+
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32,
- u32 *, u32 *, bool);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
int bch2_fs_subvolumes_init(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
index af79134b07d6..e029df7ba89f 100644
--- a/fs/bcachefs/subvolume_format.h
+++ b/fs/bcachefs/subvolume_format.h
@@ -19,8 +19,8 @@ struct bch_subvolume {
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
- __le32 parent;
- __le32 pad;
+ __le32 creation_parent;
+ __le32 fs_path_parent;
bch_le128 otime;
};
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc391..1549d6daf7af 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -9,17 +9,19 @@ typedef DARRAY(u32) snapshot_id_list;
#define IS_ANCESTOR_BITMAP 128
struct snapshot_t {
+ bool live;
u32 parent;
u32 skip[3];
u32 depth;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree;
- u32 equiv;
unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
};
struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
@@ -28,7 +30,8 @@ struct snapshot_table {
};
typedef struct {
- u32 subvol;
+ /* we can't have padding in this struct: */
+ u64 subvol;
u64 inum;
} subvol_inum;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 36988add581f..a81a7b6c0989 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
#include "journal.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
@@ -23,6 +23,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/string_choices.h>
static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
};
@@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = {
#undef x
};
-void bch2_version_to_text(struct printbuf *out, unsigned v)
+void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
{
const char *str = "(unknown version)";
@@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v)
prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
}
-unsigned bch2_latest_compatible_version(unsigned v)
+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
{
if (!BCH_VERSION_MAJOR(v))
return v;
@@ -68,6 +69,22 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
+bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+{
+ bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+ version <= c->sb.version_incompat_allowed;
+
+ if (ret) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return ret;
+}
+
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -76,7 +93,7 @@ const char * const bch2_sb_fields[] = {
};
static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
@@ -142,8 +159,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev_handle))
- bdev_release(sb->bdev_handle);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ bdev_fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -232,7 +249,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return NULL;
}
}
@@ -287,6 +304,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
return -BCH_ERR_invalid_sb_layout_nr_superblocks;
}
+ if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
+ prt_printf(out, "Invalid superblock layout: max_size_bits too high");
+ return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
+ }
+
max_sectors = 1 << layout->sb_max_size_bits;
prev_offset = le64_to_cpu(layout->sb_offset[0]);
@@ -344,8 +366,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
- int rw)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
+ enum bch_validate_flags flags, struct printbuf *out)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
@@ -363,6 +385,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_features;
}
+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
+ prt_printf(out, "Filesystem has incompatible version");
+ return -BCH_ERR_invalid_sb_features;
+ }
+
block_size = le16_to_cpu(sb->block_size);
if (block_size > PAGE_SECTORS) {
@@ -401,7 +429,22 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_time_precision;
}
- if (rw == READ) {
+ /* old versions didn't know to downgrade this field */
+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
+
+ if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
+ prt_printf(out, "Invalid version_incompat ");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
+ prt_str(out, " > incompat_allowed ");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
+ if (flags & BCH_VALIDATE_write)
+ return -BCH_ERR_invalid_sb_version;
+ else
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
+ }
+
+ if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
* zeroed, so we're now validating them, but we have to be
@@ -414,8 +457,20 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
+ !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
+ SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
+ SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
+#ifdef __KERNEL__
+ if (!BCH_SB_SHARD_INUMS_NBITS(sb))
+ SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
+#endif
+
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;
@@ -457,7 +512,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_members_missing;
}
- ret = bch2_sb_field_validate(sb, &mi->field, out);
+ ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
if (ret)
return ret;
@@ -465,11 +520,19 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
continue;
- ret = bch2_sb_field_validate(sb, f, out);
+ ret = bch2_sb_field_validate(sb, f, flags, out);
if (ret)
return ret;
}
+ if ((flags & BCH_VALIDATE_write) &&
+ bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+ prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+ le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+ le64_to_cpu(sb->seq));
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
return 0;
}
@@ -499,6 +562,9 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.uuid = src->uuid;
c->sb.user_uuid = src->user_uuid;
c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src);
+ c->sb.version_incompat_allowed
+ = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
c->sb.version_min = le16_to_cpu(src->version_min);
c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
c->sb.nr_devices = src->nr_devices;
@@ -519,9 +585,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -639,9 +707,10 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
- bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
+ if (bytes > sb_size) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
+ bytes, sb_size);
return -BCH_ERR_invalid_sb_too_big;
}
@@ -653,7 +722,8 @@ reread:
}
enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
- if (csum_type >= BCH_CSUM_NR) {
+ if (csum_type >= BCH_CSUM_NR ||
+ bch2_csum_type_is_encryption(csum_type)) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
@@ -690,8 +760,11 @@ retry:
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name)
- return -ENOMEM;
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
@@ -704,22 +777,23 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev_handle) &&
- PTR_ERR(sb->bdev_handle) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev_handle))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev_handle)) {
- ret = PTR_ERR(sb->bdev_handle);
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
+ prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
- sb->bdev = sb->bdev_handle->bdev;
+ sb->bdev = file_bdev(sb->s_bdev_file);
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -743,9 +817,9 @@ retry:
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- printk(KERN_INFO "%s", err2.buf);
+ bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
else
- printk(KERN_ERR "%s", err2.buf);
+ bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
@@ -777,8 +851,10 @@ retry:
i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i);
- if (offset == opt_get(*opts, sb))
+ if (offset == opt_get(*opts, sb)) {
+ ret = -BCH_ERR_invalid;
continue;
+ }
ret = read_one_super(sb, offset, &err);
if (!ret)
@@ -803,21 +879,20 @@ got_super:
goto err;
}
- ret = 0;
sb->have_layout = true;
- ret = bch2_sb_validate(sb, &err, READ);
+ ret = bch2_sb_validate(sb, 0, &err);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
goto err_no_print;
}
out:
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -850,7 +925,7 @@ static void write_super_endio(struct bio *bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"superblock %s error: %s",
- bio_data_dir(bio) ? "write" : "read",
+ str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@@ -863,14 +938,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio;
+ memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+ bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
- bio_sectors(bio));
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write);
@@ -910,6 +986,7 @@ int bch2_write_super(struct bch_fs *c)
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ DARRAY(struct bch_dev *) online_devices = {};
int ret = 0;
trace_and_count(c, write_super, c, _RET_IP_);
@@ -922,6 +999,15 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ for_each_online_member(c, ca) {
+ ret = darray_push(&online_devices, ca);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ percpu_ref_get(&ca->io_ref);
+ }
+
/* Make sure we're using the new magic numbers: */
c->disk_sb.sb->magic = BCHFS_MAGIC;
c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
@@ -929,8 +1015,8 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- for_each_online_member(c, ca)
- __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ darray_for_each(online_devices, ca)
+ __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
if (test_bit(BCH_FS_error, &c->flags))
@@ -946,16 +1032,15 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(c, ca)
- bch2_sb_from_fs(c, ca);
+ darray_for_each(online_devices, ca)
+ bch2_sb_from_fs(c, (*ca));
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- percpu_ref_put(&ca->io_ref);
goto out;
}
}
@@ -977,58 +1062,79 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, " > ");
bch2_version_to_text(&buf, bcachefs_metadata_version_current);
prt_str(&buf, ")");
- bch2_fs_fatal_error(c, "%s", buf.buf);
+ bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_sb_not_downgraded;
}
- for_each_online_member(c, ca) {
- __set_bit(ca->dev_idx, sb_written.d);
- ca->sb_write_error = 0;
+ darray_for_each(online_devices, ca) {
+ __set_bit((*ca)->dev_idx, sb_written.d);
+ (*ca)->sb_write_error = 0;
}
- for_each_online_member(c, ca)
- read_back_super(c, ca);
+ darray_for_each(online_devices, ca)
+ read_back_super(c, *ca);
closure_sync(cl);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+
if (ca->sb_write_error)
continue;
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
- "Superblock write was silently dropped! (seq %llu expected %llu)",
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
+ ": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
- ret = -BCH_ERR_erofs_sb_err;
- goto out;
+
+ if (c->opts.errors != BCH_ON_ERROR_continue &&
+ c->opts.errors != BCH_ON_ERROR_fix_safe) {
+ ret = -BCH_ERR_erofs_sb_err;
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ } else {
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
}
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
- "Superblock modified by another process (seq %llu expected %llu)",
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
+ ": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = -BCH_ERR_erofs_sb_err;
- goto out;
}
}
+ if (ret)
+ goto out;
+
do {
wrote = false;
- for_each_online_member(c, ca)
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
+ }
closure_sync(cl);
sb++;
} while (wrote);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1058,12 +1164,15 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices (from %ps)",
+ ": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ darray_for_each(online_devices, ca)
+ percpu_ref_put(&(*ca)->io_ref);
+ darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
}
@@ -1093,23 +1202,19 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
* c->sb will be checked before we write the superblock, so update it as
* well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
- }
- if (c->sb.version > bcachefs_metadata_version_current) {
+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
+ if (c->sb.version > bcachefs_metadata_version_current)
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version = bcachefs_metadata_version_current;
- }
- if (c->sb.version_min > bcachefs_metadata_version_current) {
+ if (c->sb.version_min > bcachefs_metadata_version_current)
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version_min = bcachefs_metadata_version_current;
- }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
{
lockdep_assert_held(&c->sb_lock);
@@ -1119,10 +1224,16 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
c->disk_sb.sb->version = cpu_to_le16(new_version);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+
+ if (incompat) {
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
+ }
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
if (vstruct_bytes(f) < 88) {
prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
@@ -1137,8 +1248,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_ext *e = field_to_type(f, ext);
- prt_printf(out, "Recovery passes required:");
- prt_tab(out);
+ prt_printf(out, "Recovery passes required:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
prt_newline(out);
@@ -1147,13 +1257,17 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
if (errors_silent) {
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
- prt_printf(out, "Errors to silently fix:");
- prt_tab(out);
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_printf(out, "Errors to silently fix:\t");
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
+ min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
prt_newline(out);
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:\t");
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
@@ -1178,14 +1292,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
}
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
struct printbuf field_err = PRINTBUF;
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
int ret;
- ret = ops->validate ? ops->validate(sb, f, &field_err) : 0;
+ ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
if (ret) {
prt_printf(err, "Invalid superblock section %s: %s",
bch2_sb_fields[type], field_err.buf);
@@ -1252,104 +1366,86 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- u64 fields_have = 0;
- unsigned nr_devices = 0;
-
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
- for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_dev_exists(sb, i);
-
- prt_printf(out, "External UUID:");
- prt_tab(out);
+ prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
- prt_printf(out, "Internal UUID:");
- prt_tab(out);
+ prt_printf(out, "Internal UUID:\t");
pr_uuid(out, sb->uuid.b);
prt_newline(out);
- prt_printf(out, "Magic number:");
- prt_tab(out);
+ prt_printf(out, "Magic number:\t");
pr_uuid(out, sb->magic.b);
prt_newline(out);
- prt_str(out, "Device index:");
- prt_tab(out);
- prt_printf(out, "%u", sb->dev_idx);
- prt_newline(out);
+ prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:");
- prt_tab(out);
- prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_printf(out, "Label:\t");
+ if (!strlen(sb->label))
+ prt_printf(out, "(none)");
+ else
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:");
- prt_tab(out);
+ prt_printf(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:");
- prt_tab(out);
+ prt_printf(out, "Incompatible features allowed:\t");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Incompatible features in use:\t");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
- prt_printf(out, "Oldest version on disk:");
- prt_tab(out);
+ prt_printf(out, "Oldest version on disk:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version_min));
prt_newline(out);
- prt_printf(out, "Created:");
- prt_tab(out);
+ prt_printf(out, "Created:\t");
if (sb->time_base_lo)
bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
- prt_printf(out, "Sequence number:");
- prt_tab(out);
+ prt_printf(out, "Sequence number:\t");
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
- prt_printf(out, "Time of last write:");
- prt_tab(out);
+ prt_printf(out, "Time of last write:\t");
bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
prt_newline(out);
- prt_printf(out, "Superblock size:");
- prt_tab(out);
+ prt_printf(out, "Superblock size:\t");
prt_units_u64(out, vstruct_bytes(sb));
prt_str(out, "/");
prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
- prt_printf(out, "Clean:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
- prt_newline(out);
+ prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
- prt_printf(out, "Devices:");
- prt_tab(out);
- prt_printf(out, "%u", nr_devices);
- prt_newline(out);
-
- prt_printf(out, "Sections:");
+ prt_printf(out, "Sections:\t");
+ u64 fields_have = 0;
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
- prt_tab(out);
prt_bitflags(out, bch2_sb_fields, fields_have);
prt_newline(out);
- prt_printf(out, "Features:");
- prt_tab(out);
+ prt_printf(out, "Features:\t");
prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
prt_newline(out);
- prt_printf(out, "Compat features:");
- prt_tab(out);
+ prt_printf(out, "Compat features:\t");
prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
prt_newline(out);
@@ -1366,8 +1462,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
if (opt->get_sb != BCH2_NO_SB_OPT) {
u64 v = bch2_opt_from_sb(sb, id);
- prt_printf(out, "%s:", opt->attr.name);
- prt_tab(out);
+ prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
prt_newline(out);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 95e80e06316b..b4cff9ebdebb 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -10,14 +10,26 @@
#include <asm/byteorder.h>
+#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
+
static inline bool bch2_version_compatible(u16 version)
{
return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
version >= bcachefs_metadata_version_min;
}
-void bch2_version_to_text(struct printbuf *, unsigned);
-unsigned bch2_latest_compatible_version(unsigned);
+void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
+
+bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+
+static inline bool bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
+{
+ return likely(version <= c->sb.version_incompat)
+ ? true
+ : bch2_set_version_incompat(c, version);
+}
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
@@ -51,7 +63,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ int (*validate)(struct bch_sb *, struct bch_sb_field *,
+ enum bch_validate_flags, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
};
@@ -91,7 +104,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
}
bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned);
+void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6b23e11825e6..0459c875e189 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
@@ -24,6 +25,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -56,6 +58,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@@ -86,26 +89,51 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+void bch2_print_str(struct bch_fs *c, const char *str)
{
+#ifdef __KERNEL__
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
- va_list args;
- va_start(args, fmt);
- if (likely(!stdio)) {
- vprintk(fmt, args);
- } else {
- unsigned long flags;
+ if (unlikely(stdio)) {
+ bch2_stdio_redirect_printf(stdio, true, "%s", str);
+ return;
+ }
+#endif
+ bch2_print_string_as_lines(KERN_ERR, str);
+}
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
+{
+#ifdef __KERNEL__
+ if (unlikely(stdio)) {
if (fmt[0] == KERN_SOH[0])
fmt += 2;
- spin_lock_irqsave(&stdio->output_lock, flags);
- prt_vprintf(&stdio->output_buf, fmt, args);
- spin_unlock_irqrestore(&stdio->output_lock, flags);
-
- wake_up(&stdio->output_wait);
+ bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+ return;
}
+#endif
+ vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
+
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
+
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@@ -156,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock);
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
@@ -208,22 +237,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
return c;
}
-static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
-{
- unsigned nr = 0, u64s =
- ((sizeof(struct jset_entry_dev_usage) +
- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
- sizeof(u64);
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- nr++;
- rcu_read_unlock();
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->dev_usage_journal_res, u64s * nr);
-}
-
/* Filesystem RO/RW: */
/*
@@ -250,7 +263,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
- bch2_gc_thread_stop(c);
bch2_fs_ec_flush(c);
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
@@ -260,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
clean_passes++;
if (bch2_btree_interior_updates_flush(c) ||
+ bch2_btree_write_buffer_flush_going_ro(c) ||
bch2_journal_flush_all_pins(&c->journal) ||
bch2_btree_flush_all_writes(c) ||
seq != atomic64_read(&c->journal.seq)) {
@@ -271,11 +284,16 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
journal_cur_seq(&c->journal));
- if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sclean shutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
@@ -352,12 +370,13 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
+ bch2_verify_accounting_clean(c);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@@ -392,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
return ret;
}
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
+{
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+ bch2_journal_halt_locked(&c->journal);
+ bch2_fs_read_only_async(c);
+
+ wake_up(&bch2_read_only_wait);
+ return ret;
+}
+
static int bch2_fs_read_write_late(struct bch_fs *c)
{
int ret;
@@ -422,6 +452,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
int ret;
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
return -BCH_ERR_erofs_unfixed_errors;
@@ -448,7 +480,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
- set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+ set_bit(JOURNAL_need_flush_write, &c->journal.flags);
+ set_bit(JOURNAL_running, &c->journal.flags);
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
@@ -466,12 +499,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
}
#endif
- ret = bch2_gc_thread_start(c);
- if (ret) {
- bch_err(c, "error starting gc thread");
- return ret;
- }
-
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -497,7 +524,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -517,12 +545,12 @@ int bch2_fs_read_write_early(struct bch_fs *c)
static void __bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -530,6 +558,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c);
@@ -537,22 +566,29 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_io_read_exit(c);
bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
- bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
+ bch2_fs_btree_iter_exit(c);
bch2_fs_replicas_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
+ bch2_fs_btree_gc_exit(c);
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
- free_percpu(c->online_reserved);
+ if (c->online_reserved) {
+ u64 v = percpu_u64_get(c->online_reserved);
+ WARN(v, "online_reserved not 0 at shutdown: %lli", v);
+ free_percpu(c->online_reserved);
+ }
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
+ free_percpu(c->usage);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@@ -562,12 +598,13 @@ static void __bch2_fs_free(struct bch_fs *c)
#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
- kfree(c->unused_inode_hints);
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
- if (c->io_complete_wq)
- destroy_workqueue(c->io_complete_wq);
+ if (c->btree_write_submit_wq)
+ destroy_workqueue(c->btree_write_submit_wq);
+ if (c->btree_read_complete_wq)
+ destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
@@ -576,7 +613,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
- kvpfree(c, sizeof(*c));
+ kvfree(c);
module_put(THIS_MODULE);
}
@@ -593,16 +630,12 @@ void __bch2_fs_stop(struct bch_fs *c)
set_bit(BCH_FS_stopping, &c->flags);
- cancel_work_sync(&c->journal_seq_blacklist_gc_work);
-
down_write(&c->state_lock);
bch2_fs_read_only(c);
up_write(&c->state_lock);
for_each_member_device(c, ca)
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -642,6 +675,7 @@ void bch2_fs_free(struct bch_fs *c)
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
if (ca) {
+ EBUG_ON(atomic_long_read(&ca->ref) != 1);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -696,7 +730,7 @@ static int bch2_fs_online(struct bch_fs *c)
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
goto err;
}
}
@@ -715,7 +749,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
- c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@@ -745,13 +779,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
refcount_set(&c->ro_ref, 1);
init_waitqueue_head(&c->ro_ref_wait);
+ spin_lock_init(&c->recovery_pass_lock);
sema_init(&c->online_fsck_mutex, 1);
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
- atomic_set(&c->journal_keys.ref, 1);
- c->journal_keys.initial_ref_held = true;
-
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
@@ -759,6 +789,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_journal_keys_init(c);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
@@ -769,24 +800,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->list);
- mutex_init(&c->usage_scratch_lock);
-
mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->snapshot_table_lock);
init_rwsem(&c->snapshot_create_lock);
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->journal_seq_blacklist_gc_work,
- bch2_blacklist_entries_gc);
-
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_error_msgs);
mutex_init(&c->fsck_error_msgs_lock);
- seqcount_init(&c->gc_pos_lock);
-
seqcount_init(&c->usage_lock);
sema_init(&c->io_in_flight, 128);
@@ -794,10 +818,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->vfs_inodes_list);
mutex_init(&c->vfs_inodes_lock);
- c->copy_gc_enabled = 1;
- c->rebalance.enabled = 1;
- c->promote_whole_extents = true;
-
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
@@ -818,13 +838,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -859,16 +879,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
- c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
-
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -881,12 +901,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
- !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
- sizeof(u64), GFP_KERNEL))) {
+ mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+ c->opts.btree_node_size) ||
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
}
@@ -896,11 +915,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
- bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_btree_gc_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
@@ -910,23 +929,24 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
+ bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c);
if (ret)
goto err;
- for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb.sb, i) &&
- bch2_dev_alloc(c, i)) {
- ret = -EEXIST;
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+ ret = bch2_dev_alloc(c, i);
+ if (ret)
goto err;
- }
+ }
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
- bch2_dev_usage_journal_reserve(c);
bch2_journal_entry_res_resize(&c->journal,
&c->clock_journal_res,
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
@@ -952,7 +972,7 @@ static void print_mount_opts(struct bch_fs *c)
struct printbuf p = PRINTBUF;
bool first = true;
- prt_str(&p, "mounting version ");
+ prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
if (c->opts.read_only) {
@@ -1002,15 +1022,26 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ c->recovery_task = current;
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
+ c->recovery_task = NULL;
+
if (ret)
goto err;
@@ -1061,7 +1092,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb)
+ struct bch_sb_handle *sb,
+ struct bch_opts *opts)
{
if (fs == sb)
return 0;
@@ -1069,7 +1101,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
+ if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
if (fs->sb->block_size != sb->sb->block_size)
@@ -1094,19 +1126,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_bdevname(&buf, fs->bdev);
prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));;
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
prt_newline(&buf);
prt_bdevname(&buf, sb->bdev);
prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
prt_newline(&buf);
- prt_printf(&buf, "Not using older sb");
+ if (!opts->no_splitbrain_check)
+ prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1124,17 +1159,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, "believes seq of ");
+ prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
+
+ if (!opts->no_splitbrain_check) {
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+ }
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
return 0;
@@ -1153,26 +1193,26 @@ static void bch2_dev_free(struct bch_dev *ca)
{
cancel_work_sync(&ca->io_error_work);
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
- bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
- free_page((unsigned long) ca->sb_read_scratch);
+ kfree(ca->sb_read_scratch);
- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
+#endif
kobject_put(&ca->kobj);
}
@@ -1190,21 +1230,20 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->io_ref_completion);
- if (ca->kobj.state_in_sysfs) {
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
+ bch2_dev_unlink(ca);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
}
+#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->ref_completion);
}
+#endif
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
@@ -1213,6 +1252,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
complete(&ca->io_ref_completion);
}
+static void bch2_dev_unlink(struct bch_dev *ca)
+{
+ struct kobject *b;
+
+ /*
+ * This is racy w.r.t. the underlying block device being hot-removed,
+ * which removes it from sysfs.
+ *
+ * It'd be lovely if we had a way to handle this race, but the sysfs
+ * code doesn't appear to provide a good method and block/holder.c is
+ * susceptible as well:
+ */
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev &&
+ (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
+ sysfs_remove_link(b, "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+}
+
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
int ret;
@@ -1256,12 +1315,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
- init_rwsem(&ca->bucket_lock);
-
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@@ -1273,14 +1330,19 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / btree_sectors(c));
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
- 0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+#ifndef CONFIG_BCACHEFS_DEBUG
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
+ goto err;
+#else
+ atomic_long_set(&ca->ref, 1);
+#endif
+
+ bch2_dev_allocator_background_init(ca);
+
+ if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+ !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
- bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1308,7 +1370,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
struct bch_dev *ca = NULL;
- int ret = 0;
if (bch2_fs_init_fault("dev_alloc"))
goto err;
@@ -1320,10 +1381,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
bch2_dev_attach(c, ca, dev_idx);
- return ret;
+ return 0;
err:
- if (ca)
- bch2_dev_free(ca);
return -BCH_ERR_ENOMEM_dev_alloc;
}
@@ -1371,10 +1430,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
+ BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
- ca = bch_dev_locked(c, sb->sb->dev_idx);
+ ca = bch2_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
@@ -1466,10 +1524,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb.sb, i))
+ if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
- ca = bch_dev_locked(c, i);
+ ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
@@ -1490,6 +1548,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
@@ -1501,6 +1560,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1548,32 +1608,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
-static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_NORUN, NULL);
- bch_err_msg(c, ret, "removing dev alloc info");
- return ret;
-}
-
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@@ -1586,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* We consume a reference to ca->ref, regardless of whether we succeed
* or fail:
*/
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
@@ -1597,27 +1631,37 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "bch2_dev_data_drop()");
if (ret)
goto err;
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
goto err;
+ /*
+ * We need to flush the entire journal to get rid of keys that reference
+ * the device being removed before removing the superblock entry
+ */
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * this is really just needed for the bch2_replicas_gc_(start|end)
+ * calls, and could be cleaned up:
+ */
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
goto err;
ret = bch2_journal_flush(&c->journal);
- bch_err(ca, "journal error");
+ bch_err_msg(ca, ret, "bch2_journal_flush()");
if (ret)
goto err;
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "bch2_replicas_gc2()");
if (ret)
goto err;
@@ -1638,23 +1682,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_kill(&ca->ref);
+#else
+ ca->dying = true;
+ bch2_dev_put(ca);
+#endif
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
/*
- * At this point the device object has been removed in-core, but the
- * on-disk journal might still refer to the device index via sb device
- * usage entries. Recovery fails if it sees usage information for an
- * invalid device. Flush journal pins to push the back of the journal
- * past now invalid device index references before we update the
- * superblock, but after the device object has been removed so any
- * further journal writes elide usage info for the device.
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
*/
@@ -1666,8 +1704,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
-
- bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_rw &&
@@ -1683,9 +1719,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members_v2 *mi;
- struct bch_member dev_mi;
- unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@@ -1695,7 +1728,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+ struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@@ -1715,17 +1748,10 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}
- bch2_dev_usage_init(ca);
-
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret)
goto err;
- ret = bch2_dev_journal_alloc(ca);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err;
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1735,36 +1761,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
if (dynamic_fault("bcachefs:add:no_slot"))
- goto no_slot;
-
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
- goto have_slot;
-no_slot:
- ret = -BCH_ERR_ENOSPC_sb_members;
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+ goto err_unlock;
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi) {
- ret = -BCH_ERR_ENOSPC_sb_members;
+ ret = bch2_sb_member_alloc(c);
+ if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ unsigned dev_idx = ret;
/* success: */
- *m = dev_mi;
- m->last_mount = cpu_to_le64(ktime_get_real_seconds());
- c->disk_sb.sb->nr_devices = nr_devices;
+ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
@@ -1779,9 +1788,11 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- bch2_dev_usage_journal_reserve(c);
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret)
+ goto err_late;
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(ca, ret, "marking new superblock");
if (ret)
goto err_late;
@@ -1791,13 +1802,20 @@ have_slot:
if (ret)
goto err_late;
- ca->new_fs_bucket_idx = 0;
-
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
+ goto err_late;
+
up_write(&c->state_lock);
- return 0;
+out:
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
+ return ret;
err_unlock:
mutex_unlock(&c->sb_lock);
@@ -1806,10 +1824,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
+ goto out;
err_late:
up_write(&c->state_lock);
ca = NULL;
@@ -1835,7 +1850,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@@ -1844,9 +1859,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ret)
goto err;
- ca = bch_dev_locked(c, dev_idx);
+ ca = bch2_dev_locked(c, dev_idx);
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
if (ret)
goto err;
@@ -1862,7 +1877,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca);
+ ret = bch2_dev_journal_alloc(ca, false);
bch_err_msg(ca, ret, "allocating journal");
if (ret)
goto err;
@@ -1919,6 +1934,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
goto err;
}
+ if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
+ bch_err(ca, "New device size too big (%llu greater than max %u)",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ ret = -BCH_ERR_device_size_too_big;
+ goto err;
+ }
+
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
@@ -1932,7 +1954,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (ret)
goto err;
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
if (ret)
goto err;
@@ -1944,15 +1966,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
+
+ ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;
-
- /*
- * XXX: this is all wrong transactionally - we'll be able to do
- * this correctly after the disk space accounting rewrite
- */
- ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
}
bch2_recalc_capacity(c);
@@ -1964,13 +1989,12 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- if (!strcmp(name, ca->name)) {
- rcu_read_unlock();
+ if (!strncmp(name, "/dev/", strlen("/dev/")))
+ name += strlen("/dev/");
+
+ for_each_member_device(c, ca)
+ if (!strcmp(name, ca->name))
return ca;
- }
- rcu_read_unlock();
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
@@ -2023,7 +2047,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb);
+ ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index dada09331d2e..04f8287eff5c 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -29,21 +29,12 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
-/*
- * Only for use in the recovery/fsck path:
- */
-static inline void bch2_fs_lazy_rw(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_rw, &c->flags) &&
- !test_bit(BCH_FS_was_rw, &c->flags))
- bch2_fs_read_write_early(c);
-}
-
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e7f..368a63d938cf 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
- struct bdev_handle *bdev_handle;
+ struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
@@ -26,17 +26,4 @@ struct bch_devs_list {
u8 data[BCH_BKEY_PTRS_MAX];
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 valid;
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cee80c47feea..a7eb1f511484 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -22,10 +22,12 @@
#include "buckets.h"
#include "clock.h"
#include "compress.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
@@ -139,10 +141,12 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
-write_attribute(prune_cache);
-write_attribute(btree_wakeup);
-rw_attribute(btree_gc_periodic);
-rw_attribute(gc_gens_pos);
+write_attribute(trigger_journal_flush);
+write_attribute(trigger_journal_writes);
+write_attribute(trigger_btree_cache_shrink);
+write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_freelist_wakeup);
+read_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
@@ -166,9 +170,9 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
+read_attribute(btree_reserve_cache);
read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
@@ -189,12 +193,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
{
bch2_printbuf_tabstop_push(out, 24);
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
- prt_str(out, bch2_write_refs[i]);
- prt_tab(out);
- prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
+ prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
}
#endif
@@ -203,21 +203,20 @@ read_attribute(disk_groups);
read_attribute(has_data);
read_attribute(alloc_debug);
+read_attribute(usage_base);
#define x(t, n, ...) read_attribute(t);
BCH_PERSISTENT_COUNTERS()
#undef x
rw_attribute(discard);
+read_attribute(state);
rw_attribute(label);
-rw_attribute(copy_gc_enabled);
read_attribute(copy_gc_wait);
-rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
-rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -232,149 +231,88 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0444 };
+ { .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
-static struct attribute sysfs_state_rw = {
- .name = "state",
- .mode = 0444,
-};
-
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
-
- mutex_unlock(&c->btree_cache.lock);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct btree_trans *trans;
- enum btree_id id;
- struct compression_type_stats {
- u64 nr_extents;
- u64 sectors_compressed;
- u64 sectors_uncompressed;
- } s[BCH_COMPRESSION_TYPE_NR];
- u64 compressed_incompressible = 0;
- int ret = 0;
-
- memset(s, 0, sizeof(s));
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EPERM;
-
- trans = bch2_trans_get(c);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
- continue;
-
- ret = for_each_btree_key(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *entry;
- bool compressed = false, incompressible = false;
-
- bkey_for_each_crc(k.k, ptrs, crc, entry) {
- incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
- compressed |= crc_is_compressed(crc);
-
- if (crc_is_compressed(crc)) {
- s[crc.compression_type].nr_extents++;
- s[crc.compression_type].sectors_compressed += crc.compressed_size;
- s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
- }
- }
-
- compressed_incompressible += compressed && incompressible;
-
- if (!compressed) {
- unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
-
- s[t].nr_extents++;
- s[t].sectors_compressed += k.k->size;
- s[t].sectors_uncompressed += k.k->size;
- }
- 0;
- }));
- }
-
- bch2_trans_put(trans);
-
- if (ret)
- return ret;
-
prt_str(out, "type");
printbuf_tabstop_push(out, 12);
- prt_tab(out);
-
- prt_str(out, "compressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "uncompressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "average extent size");
printbuf_tabstop_push(out, 24);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
+
+ for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
+ struct disk_accounting_pos a = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ .compression.type = i,
+ };
+ struct bpos p = disk_accounting_pos_to_bpos(&a);
+ u64 v[3];
+ bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
+
+ u64 nr_extents = v[0];
+ u64 sectors_uncompressed = v[1];
+ u64 sectors_compressed = v[2];
- for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
bch2_prt_compression_type(out, i);
prt_tab(out);
- prt_human_readable_u64(out, s[i].sectors_compressed << 9);
+ prt_human_readable_u64(out, sectors_compressed << 9);
prt_tab_rjust(out);
- prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
+ prt_human_readable_u64(out, sectors_uncompressed << 9);
prt_tab_rjust(out);
- prt_human_readable_u64(out, s[i].nr_extents
- ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
+ prt_human_readable_u64(out, nr_extents
+ ? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
}
- if (compressed_incompressible) {
- prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
- prt_newline(out);
- }
-
return 0;
}
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
+ bch2_btree_id_to_text(out, c->gc_gens_btree);
+ prt_printf(out, ": ");
bch2_bpos_to_text(out, c->gc_gens_pos);
prt_printf(out, "\n");
}
-static void bch2_btree_wakeup_all(struct bch_fs *c)
+static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct btree_trans *trans;
-
- seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+ struct bch_fs_usage_base b = {};
- if (b)
- six_lock_wakeup_all(&b->lock);
+ acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
- }
- seqmutex_unlock(&c->btree_trans_lock);
+ prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
+ prt_printf(out, "btree:\t\t%llu\n", b.btree);
+ prt_printf(out, "data:\t\t%llu\n", b.data);
+ prt_printf(out, "cached:\t%llu\n", b.cached);
+ prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
+ prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
}
SHOW(bch2_fs)
@@ -392,14 +330,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_write_stats)
bch2_btree_write_stats_to_text(out, c);
- sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
-
if (attr == &sysfs_gc_gens_pos)
bch2_gc_gens_pos_to_text(out, c);
- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
-
- sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
if (attr == &sysfs_copy_gc_wait)
@@ -408,27 +341,25 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
- sysfs_print(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, c);
+ bch2_btree_cache_to_text(out, &c->btree_cache);
if (attr == &sysfs_btree_key_cache)
bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
+ if (attr == &sysfs_btree_reserve_cache)
+ bch2_btree_reserve_cache_to_text(out, c);
+
if (attr == &sysfs_stripes_heap)
bch2_stripes_heap_to_text(out, c);
if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c);
+ bch2_open_buckets_to_text(out, c, NULL);
if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);
@@ -462,6 +393,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_disk_groups)
bch2_disk_groups_to_text(out, c);
+ if (attr == &sysfs_alloc_debug)
+ bch2_fs_alloc_debug_to_text(out, c);
+
+ if (attr == &sysfs_usage_base)
+ bch2_fs_usage_base_to_text(out, c);
+
return 0;
}
@@ -469,35 +406,8 @@ STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- if (attr == &sysfs_btree_gc_periodic) {
- ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
- ?: (ssize_t) size;
-
- wake_up_process(c->gc_thread);
- return ret;
- }
-
- if (attr == &sysfs_copy_gc_enabled) {
- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
- ?: (ssize_t) size;
-
- if (c->copygc_thread)
- wake_up_process(c->copygc_thread);
- return ret;
- }
-
- if (attr == &sysfs_rebalance_enabled) {
- ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
- ?: (ssize_t) size;
-
- rebalance_wakeup(c);
- return ret;
- }
-
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
- sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (!test_bit(BCH_FS_started, &c->flags))
@@ -505,39 +415,46 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_rw, &c->flags))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
- if (attr == &sysfs_prune_cache) {
+ if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
- if (attr == &sysfs_btree_wakeup)
- bch2_btree_wakeup_all(c);
-
- if (attr == &sysfs_trigger_gc) {
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- down_read(&c->state_lock);
- bch2_gc(c, false, false);
- up_read(&c->state_lock);
-#else
- bch2_gc_gens(c);
-#endif
+ if (attr == &sysfs_trigger_btree_key_cache_shrink) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
+ if (attr == &sysfs_trigger_gc)
+ bch2_gc_gens(c);
+
if (attr == &sysfs_trigger_discards)
bch2_do_discards(c);
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
+ if (attr == &sysfs_trigger_journal_writes)
+ bch2_journal_do_writes(&c->journal);
+
+ if (attr == &sysfs_trigger_freelist_wakeup)
+ closure_wake_up(&c->freelist_wait);
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +475,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -567,7 +485,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_cache_size,
&sysfs_btree_write_stats,
- &sysfs_promote_whole_extents,
+ &sysfs_rebalance_status,
&sysfs_compression_stats,
@@ -587,17 +505,22 @@ SHOW(bch2_fs_counters)
printbuf_tabstop_push(out, 32);
- #define x(t, ...) \
+ #define x(t, n, f, ...) \
if (attr == &sysfs_##t) { \
counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- prt_printf(out, "since mount:"); \
- prt_tab(out); \
+ if (f & TYPE_SECTORS) { \
+ counter <<= 9; \
+ counter_since_mount <<= 9; \
+ } \
+ \
+ prt_printf(out, "since mount:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
prt_human_readable_u64(out, counter_since_mount); \
prt_newline(out); \
\
- prt_printf(out, "since filesystem creation:"); \
- prt_tab(out); \
+ prt_printf(out, "since filesystem creation:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
prt_human_readable_u64(out, counter); \
prt_newline(out); \
}
@@ -639,9 +562,9 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
+ &sysfs_btree_reserve_cache,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
@@ -657,16 +580,16 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
- &sysfs_prune_cache,
- &sysfs_btree_wakeup,
+ &sysfs_trigger_journal_flush,
+ &sysfs_trigger_journal_writes,
+ &sysfs_trigger_btree_cache_shrink,
+ &sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_freelist_wakeup,
&sysfs_gc_gens_pos,
- &sysfs_copy_gc_enabled,
&sysfs_copy_gc_wait,
- &sysfs_rebalance_enabled,
- &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -674,6 +597,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_internal_uuid,
&sysfs_disk_groups,
+ &sysfs_alloc_debug,
+ &sysfs_usage_base,
NULL
};
@@ -723,7 +648,7 @@ STORE(bch2_fs_opts_dir)
if (ret < 0)
goto err;
- bch2_opt_set_sb(c, opt, v);
+ bch2_opt_set_sb(c, NULL, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if (v &&
@@ -732,6 +657,13 @@ STORE(bch2_fs_opts_dir)
(id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
+ if (v && id == Opt_rebalance_enabled)
+ rebalance_wakeup(c);
+
+ if (v && id == Opt_copygc_enabled &&
+ c->copygc_thread)
+ wake_up_process(c->copygc_thread);
+
ret = size;
err:
bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
@@ -777,6 +709,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@@ -789,88 +728,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
- unsigned i, nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstop_push(out, 8);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
- bch2_dev_usage_to_text(out, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:");
- prt_newline(out);
- for (i = 0; i < BCH_WATERMARK_NR; i++) {
- prt_str(out, bch2_watermarks[i]);
- prt_tab(out);
- prt_u64(out, bch2_dev_buckets_reserved(ca, i));
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_str(out, "freelist_wait");
- prt_tab(out);
- prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open buckets allocated");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_newline(out);
-
- prt_str(out, "open buckets this dev");
- prt_tab(out);
- prt_u64(out, ca->nr_open_buckets);
- prt_newline(out);
-
- prt_str(out, "open buckets total");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT);
- prt_newline(out);
-
- prt_str(out, "open_buckets_wait");
- prt_tab(out);
- prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open_buckets_btree");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_btree]);
- prt_newline(out);
-
- prt_str(out, "open_buckets_user");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_user]);
- prt_newline(out);
-
- prt_str(out, "buckets_to_invalidate");
- prt_tab(out);
- prt_u64(out, should_invalidate_buckets(ca, stats));
- prt_newline(out);
-
- prt_str(out, "btree reserve cache");
- prt_tab(out);
- prt_u64(out, c->btree_reserve_cache_nr);
- prt_newline(out);
-}
-
static const char * const bch2_rw[] = {
"read",
"write",
@@ -915,7 +772,7 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}
- if (attr == &sysfs_state_rw) {
+ if (attr == &sysfs_state) {
prt_string_option(out, bch2_member_states, ca->mi.state);
prt_char(out, '\n');
}
@@ -930,17 +787,20 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+ bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_alloc_debug)
- dev_alloc_debug_to_text(out, ca);
+ bch2_dev_alloc_debug_to_text(out, ca);
+
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c, ca);
return 0;
}
@@ -949,32 +809,17 @@ STORE(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct bch_member *mi;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v != BCH_MEMBER_DISCARD(mi)) {
- SET_BCH_MEMBER_DISCARD(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
}
if (attr == &sysfs_durability) {
u64 v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
- SET_BCH_MEMBER_DURABILITY(mi, v + 1);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}
if (attr == &sysfs_label) {
@@ -1007,7 +852,7 @@ struct attribute *bch2_dev_files[] = {
/* settings: */
&sysfs_discard,
- &sysfs_state_rw,
+ &sysfs_state,
&sysfs_label,
&sysfs_has_data,
@@ -1023,6 +868,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
+ &sysfs_open_buckets,
NULL
};
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc57747..6c6469814637 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -121,7 +121,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ck.k.p.offset = i;
ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i++);
@@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ck.k.p.snapshot = U32_MAX;
ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i);
@@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ck.k.p.offset = i * 2;
ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i);
@@ -259,9 +259,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i >= nr * 2)
break;
@@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ck.k.p.snapshot = U32_MAX;
ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i + 8);
@@ -320,9 +320,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i == nr)
break;
BUG_ON(bkey_deleted(k.k) != !(i % 16));
@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -394,9 +394,9 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start;
- k.k_i.k.version.lo = test_version++;
+ k.k_i.k.bversion.lo = test_version++;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
return ret;
}
@@ -450,9 +450,9 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
k.k_i.k.p.snapshot = snapid;
k.k_i.k.size = len;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
return ret;
}
@@ -481,14 +481,14 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
if (ret)
return ret;
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -506,11 +506,11 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
if (ret)
return ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_snapshot_node_create(trans, U32_MAX,
snapids,
snapid_subvols,
@@ -671,8 +671,8 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
@@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ BTREE_ITER_slots|BTREE_ITER_intent, k,
NULL, NULL, 0, ({
if (iter.pos.offset >= nr)
break;
@@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
return bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
@@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, 0, ({
struct bkey_i_cookie u;
@@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
unsigned i;
u64 time;
+ if (nr == 0 || nr_threads == 0) {
+ pr_err("nr of iterations or threads is not allowed to be 0");
+ return -EINVAL;
+ }
+
atomic_set(&j.ready, nr_threads);
init_waitqueue_head(&j.ready_wait);
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 9220d7de10db..dea73bc1cb51 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -2,7 +2,6 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
-#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
@@ -10,6 +9,7 @@
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
@@ -65,68 +65,87 @@ err:
return ret;
}
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+/* stdio_redirect */
+
+static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
{
- return thr->stdio.output_buf.pos ||
- thr->output2.nr ||
- thr->thr.done;
+ return stdio->input.buf.nr > seen || stdio->done;
}
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- size_t copied = 0, b;
- int ret = 0;
+ return stdio_redirect_has_more_input(stdio, 0);
+}
- if ((file->f_flags & O_NONBLOCK) &&
- !thread_with_stdio_has_output(thr))
- return -EAGAIN;
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr || stdio->done;
+}
- ret = wait_event_interruptible(thr->stdio.output_wait,
- thread_with_stdio_has_output(thr));
- if (ret)
- return ret;
+#define STDIO_REDIRECT_BUFSIZE 4096
- if (thr->thr.done)
- return 0;
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+ return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- while (len) {
- ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
- if (ret)
- break;
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- spin_lock_irq(&thr->stdio.output_lock);
- b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+ spin_lock_init(&buf->lock);
+ init_waitqueue_head(&buf->wait);
+ darray_init(&buf->buf);
+}
- memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
- memmove(thr->stdio.output_buf.buf,
- thr->stdio.output_buf.buf + b,
- thr->stdio.output_buf.pos - b);
+/* thread_with_stdio */
- thr->output2.nr += b;
- thr->stdio.output_buf.pos -= b;
- spin_unlock_irq(&thr->stdio.output_lock);
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input.wait);
+ wake_up(&thr->stdio.output.wait);
+}
- b = min(len, thr->output2.nr);
- if (!b)
- break;
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.output;
+ size_t copied = 0, b;
+ int ret = 0;
- b -= copy_to_user(buf, thr->output2.data, b);
- if (!b) {
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+ if (ret)
+ return ret;
+ } else if (!stdio_redirect_has_output(&thr->stdio))
+ return -EAGAIN;
+
+ while (len && buf->buf.nr) {
+ if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
- copied += b;
- buf += b;
- len -= b;
-
- memmove(thr->output2.data,
- thr->output2.data + b,
- thr->output2.nr - b);
- thr->output2.nr -= b;
+ spin_lock_irq(&buf->lock);
+ b = min_t(size_t, len, buf->buf.nr);
+
+ if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->buf.nr -= b;
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ }
+ spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
@@ -137,27 +156,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
+ thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
- printbuf_exit(&thr->stdio.input_buf);
- printbuf_exit(&thr->stdio.output_buf);
- darray_exit(&thr->output2);
- thr->exit(thr);
+ darray_exit(&thr->stdio.input.buf);
+ darray_exit(&thr->stdio.output.buf);
+ thr->ops->exit(thr);
return 0;
}
-#define WRITE_BUFFER 4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
- return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- struct printbuf *buf = &thr->stdio.input_buf;
+ struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
@@ -173,29 +185,34 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
break;
}
- spin_lock(&thr->stdio.input_lock);
- if (buf->pos < WRITE_BUFFER)
- bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
- b = min(len, printbuf_remaining_size(buf));
-
- if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->pos += b;
+ spin_lock(&buf->lock);
+ size_t makeroom = b;
+ if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
+ makeroom = min_t(ssize_t, makeroom,
+ max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
+ 0));
+ darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
+
+ b = min(len, darray_room(buf->buf));
+
+ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+ buf->buf.nr += b;
+ ubuf += b;
+ len -= b;
+ copied += b;
}
- spin_unlock(&thr->stdio.input_lock);
+ spin_unlock(&buf->lock);
if (b) {
- wake_up(&thr->stdio.input_wait);
+ wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
- ret = wait_event_interruptible(thr->stdio.input_wait,
- thread_with_stdio_has_input_space(thr));
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
@@ -209,90 +226,266 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- poll_wait(file, &thr->stdio.output_wait, wait);
- poll_wait(file, &thr->stdio.input_wait, wait);
+ poll_wait(file, &thr->stdio.output.wait, wait);
+ poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
- if (thread_with_stdio_has_output(thr))
+ if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
- if (thread_with_stdio_has_input_space(thr))
+ if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ if (thr->ops->unlocked_ioctl)
+ return thr->ops->unlocked_ioctl(thr, cmd, p);
+ return -ENOTTY;
+}
+
static const struct file_operations thread_with_stdio_fops = {
- .release = thread_with_stdio_release,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
- .llseek = no_llseek,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
+ .read = thread_with_stdio_read,
+ .poll = thread_with_stdout_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
};
+static int thread_with_stdio_fn(void *arg)
+{
+ struct thread_with_stdio *thr = arg;
+
+ thr->thr.ret = thr->ops->fn(thr);
+
+ thread_with_stdio_done(thr);
+ return 0;
+}
+
+void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
+}
+
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
+{
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
+
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *))
+ const struct thread_with_stdio_ops *ops)
{
- thr->stdio.input_buf = PRINTBUF;
- thr->stdio.input_buf.atomic++;
- spin_lock_init(&thr->stdio.input_lock);
- init_waitqueue_head(&thr->stdio.input_wait);
+ bch2_thread_with_stdio_init(thr, ops);
- thr->stdio.output_buf = PRINTBUF;
- thr->stdio.output_buf.atomic++;
- spin_lock_init(&thr->stdio.output_lock);
- init_waitqueue_head(&thr->stdio.output_wait);
+ return __bch2_run_thread_with_stdio(thr);
+}
- darray_init(&thr->output2);
- thr->exit = exit;
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
+ struct stdio_buf *buf = &stdio->input;
+
+ /*
+ * we're waiting on user input (or for the file descriptor to be
+ * closed), don't want a hung task warning:
+ */
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
+ spin_lock(&buf->lock);
+ int ret = min(len, buf->buf.nr);
+ buf->buf.nr -= ret;
+ memcpy(ubuf, buf->buf.data, ret);
+ memmove(buf->buf.data,
+ buf->buf.data + ret,
+ buf->buf.nr);
+ spin_unlock(&buf->lock);
- wake_up(&stdio->input_wait);
+ wake_up(&buf->wait);
return ret;
}
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
+ darray_char *line,
+ unsigned long timeout)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
+ unsigned long until = jiffies + timeout, t;
+ struct stdio_buf *buf = &stdio->input;
+ size_t seen = 0;
+again:
+ t = timeout != MAX_SCHEDULE_TIMEOUT
+ ? max_t(long, until - jiffies, 0)
+ : timeout;
+
+ t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
+
+ wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
if (stdio->done)
return -1;
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- char *n = memchr(stdio->input_buf.buf, '\n', ret);
- if (n)
- ret = min(ret, n + 1 - stdio->input_buf.buf);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
-
- wake_up(&stdio->input_wait);
+ spin_lock(&buf->lock);
+ seen = buf->buf.nr;
+ char *n = memchr(buf->buf.data, '\n', seen);
+
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
+ spin_unlock(&buf->lock);
+ return -ETIME;
+ }
+
+ if (!n) {
+ buf->waiting_for_line = true;
+ spin_unlock(&buf->lock);
+ goto again;
+ }
+
+ size_t b = n + 1 - buf->buf.data;
+ if (b > line->size) {
+ spin_unlock(&buf->lock);
+ int ret = darray_resize(line, b);
+ if (ret)
+ return ret;
+ seen = 0;
+ goto again;
+ }
+
+ buf->buf.nr -= b;
+ memcpy(line->data, buf->buf.data, b);
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ line->nr = b;
+
+ buf->waiting_for_line = false;
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+ return 0;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
+{
+ return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+ ssize_t ret;
+
+ do {
+ va_list args2;
+ size_t len;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+ va_end(args2);
+
+ if (len + 1 <= darray_room(*out)) {
+ out->nr += len;
+ return len;
+ }
+
+ ret = darray_make_room_gfp(out, len + 1, gfp);
+ } while (ret == 0);
+
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, va_list args)
+{
+ struct stdio_buf *buf = &stdio->output;
+ unsigned long flags;
+ ssize_t ret;
+
+again:
+ spin_lock_irqsave(&buf->lock, flags);
+ ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+ spin_unlock_irqrestore(&buf->lock, flags);
+
+ if (ret < 0) {
+ if (nonblocking)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_output_space(stdio));
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, ...)
+{
+ va_list args;
+ ssize_t ret;
+
+ va_start(args, fmt);
+ ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+ va_end(args);
+
return ret;
}
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 05879c5048c8..72497b921911 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -4,6 +4,38 @@
#include "thread_with_file_types.h"
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ * - kthread shutdown behaves like writing or reading from a pipe that has been
+ * closed
+ * - Input and output buffers are 4096 bytes, although buffers may in some
+ * situations slightly exceed that limit so as to avoid chopping off a
+ * message in the middle in nonblocking mode.
+ * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ * should be fine but might change in future revisions.
+ * - Output buffer may grow past 4096 bytes to deal with messages that are
+ * bigger than 4096 bytes
+ * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ * drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
struct task_struct;
struct thread_with_file {
@@ -17,25 +49,33 @@ int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+ void (*exit)(struct thread_with_stdio *);
+ int (*fn)(struct thread_with_stdio *);
+ long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
- DARRAY(char) output2;
- void (*exit)(struct thread_with_stdio *);
+ const struct thread_with_stdio_ops *ops;
};
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input_wait);
- wake_up(&thr->stdio.output_wait);
-}
-
+void bch2_thread_with_stdio_init(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *));
+ const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
+
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index 90b5e645e98c..f4d484d44f63 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -2,14 +2,18 @@
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-struct stdio_redirect {
- spinlock_t output_lock;
- wait_queue_head_t output_wait;
- struct printbuf output_buf;
+#include "darray.h"
+
+struct stdio_buf {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ darray_char buf;
+ bool waiting_for_line;
+};
- spinlock_t input_lock;
- wait_queue_head_t input_wait;
- struct printbuf input_buf;
+struct stdio_redirect {
+ struct stdio_buf input;
+ struct stdio_buf output;
bool done;
};
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
new file mode 100644
index 000000000000..3fe82757f93a
--- /dev/null
+++ b/fs/bcachefs/time_stats.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "d", (u64) NSEC_PER_SEC * 3600 * 24},
+ { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+ { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+ { "eon", U64_MAX },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+ bool initted = stats->last_event != 0;
+
+ if (time_after64(end, start)) {
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+ duration, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ if (quantiles)
+ quantiles_update(quantiles, duration);
+ }
+
+ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+ freq, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ }
+
+ stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ for (struct time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __bch2_time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+
+void bch2_time_stats_reset(struct bch2_time_stats *stats)
+{
+ spin_lock_irq(&stats->lock);
+ unsigned offset = offsetof(struct bch2_time_stats, min_duration);
+ memset((void *) stats + offset, 0, sizeof(*stats) - offset);
+
+ if (stats->buffer) {
+ int cpu;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(stats->buffer, cpu)->nr = 0;
+ }
+ spin_unlock_irq(&stats->lock);
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
new file mode 100644
index 000000000000..dc6493f7bbab
--- /dev/null
+++ b/fs/bcachefs/time_stats.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analagous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus. average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[31];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ bool have_quantiles;
+ struct time_stat_buffer __percpu *buffer;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ u64 last_event_start;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT 8
+
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance_weighted freq_stats_weighted;
+};
+
+struct bch2_time_stats_quantiles {
+ struct bch2_time_stats stats;
+ struct quantiles quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+ return stats->have_quantiles
+ ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
+ : NULL;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats - bch2_time_stats to update
+ * @start - start time of event, recorded with local_clock()
+ *
+ * The end duration of the event will be the current time
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats - bch2_time_stats to update
+ * @v - new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+ if (v != !!stats->last_event_start) {
+ if (!v) {
+ bch2_time_stats_update(stats, stats->last_event_start);
+ stats->last_event_start = 0;
+ } else {
+ stats->last_event_start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void bch2_time_stats_reset(struct bch2_time_stats *);
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_exit(&statq->stats);
+}
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_init(&statq->stats);
+ statq->stats.have_quantiles = true;
+ memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
index dc48b52b01b4..dfad1d06633d 100644
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
@@ -4,6 +4,7 @@
#include "buckets.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 293b90d704fb..c1b51009edf6 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -3,7 +3,6 @@
#define TRACE_SYSTEM bcachefs
#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_BCACHEFS_H
#include <linux/tracepoint.h>
@@ -43,7 +42,7 @@ DECLARE_EVENT_CLASS(fs_str,
TP_fast_assign(
__entry->dev = c->dev;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@@ -64,7 +63,7 @@ DECLARE_EVENT_CLASS(trans_str,
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %pS %s",
@@ -85,7 +84,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
TP_fast_assign(
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %s",
@@ -200,6 +199,80 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* disk_accounting.c */
+
+TRACE_EVENT(accounting_mem_insert,
+ TP_PROTO(struct bch_fs *c, const char *acc),
+ TP_ARGS(c, acc),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned, new_nr )
+ __string(acc, acc )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->new_nr = c->accounting.k.nr;
+ __assign_str(acc);
+ ),
+
+ TP_printk("%d,%d entries %u added %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_nr,
+ __get_str(acc))
+);
+
+/* fs.c: */
+TRACE_EVENT(bch2_sync_fs,
+ TP_PROTO(struct super_block *sb, int wait),
+
+ TP_ARGS(sb, wait),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( int, wait )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->wait = wait;
+ ),
+
+ TP_printk("dev %d,%d wait %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait)
+);
+
+/* fs-io.c: */
+TRACE_EVENT(bch2_fsync,
+ TP_PROTO(struct file *file, int datasync),
+
+ TP_ARGS(file, datasync),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ino_t, parent )
+ __field( int, datasync )
+ ),
+
+ TP_fast_assign(
+ struct dentry *dentry = file->f_path.dentry;
+
+ __entry->dev = dentry->d_sb->s_dev;
+ __entry->ino = d_inode(dentry)->i_ino;
+ __entry->parent = d_inode(dentry->d_parent)->i_ino;
+ __entry->datasync = datasync;
+ ),
+
+ TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ (unsigned long) __entry->parent, __entry->datasync)
+);
+
/* super-io.c: */
TRACE_EVENT(write_super,
TP_PROTO(struct bch_fs *c, unsigned long ip),
@@ -508,6 +581,7 @@ TRACE_EVENT(btree_path_relock_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__array(char, node, 24 )
__field(u8, self_read_count )
@@ -525,7 +599,8 @@ TRACE_EVENT(btree_path_relock_fail,
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
- __entry->level = path->level;
+ __entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
@@ -538,7 +613,7 @@ TRACE_EVENT(btree_path_relock_fail,
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
}
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
@@ -546,9 +621,10 @@ TRACE_EVENT(btree_path_relock_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -575,6 +651,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__field(u8, locked )
__field(u8, self_read_count )
@@ -592,6 +669,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
@@ -607,9 +685,10 @@ TRACE_EVENT(btree_path_upgrade_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -638,102 +717,17 @@ DEFINE_EVENT(bch_fs, gc_gens_end,
/* Allocator */
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err),
-
- TP_STRUCT__entry(
- __field(u8, dev )
- __array(char, reserve, 16 )
- __field(u64, bucket )
- __field(u64, free )
- __field(u64, avail )
- __field(u64, copygc_wait_amount )
- __field(s64, copygc_waiting_for )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, nouse )
- __field(bool, nonblocking )
- __field(u64, nocow )
- __array(char, err, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = ca->dev_idx;
- strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
- __entry->bucket = bucket;
- __entry->free = free;
- __entry->avail = avail;
- __entry->copygc_wait_amount = copygc_wait_amount;
- __entry->copygc_waiting_for = copygc_waiting_for;
- __entry->seen = s->buckets_seen;
- __entry->open = s->skipped_open;
- __entry->need_journal_commit = s->skipped_need_journal_commit;
- __entry->nouse = s->skipped_nouse;
- __entry->nonblocking = nonblocking;
- __entry->nocow = s->skipped_nocow;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
+DEFINE_EVENT(fs_str, bucket_alloc,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
- __entry->reserve,
- __entry->dev,
- __entry->bucket,
- __entry->free,
- __entry->avail,
- __entry->copygc_wait_amount,
- __entry->copygc_waiting_for,
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->nouse,
- __entry->nocow,
- __entry->nonblocking,
- __entry->err)
+DEFINE_EVENT(fs_str, bucket_alloc_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
-);
-
-TRACE_EVENT(discard_buckets,
+DECLARE_EVENT_CLASS(discard_buckets_class,
TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
u64 need_journal_commit, u64 discarded, const char *err),
TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
@@ -765,6 +759,18 @@ TRACE_EVENT(discard_buckets,
__entry->err)
);
+DEFINE_EVENT(discard_buckets_class, discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
+DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
TRACE_EVENT(bucket_invalidate,
TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
TP_ARGS(c, dev, bucket, sectors),
@@ -878,8 +884,8 @@ TRACE_EVENT(move_data,
TRACE_EVENT(evacuate_bucket,
TP_PROTO(struct bch_fs *c, struct bpos *bucket,
unsigned sectors, unsigned bucket_size,
- u64 fragmentation, int ret),
- TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+ int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, ret),
TP_STRUCT__entry(
__field(dev_t, dev )
@@ -887,7 +893,6 @@ TRACE_EVENT(evacuate_bucket,
__field(u64, bucket )
__field(u32, sectors )
__field(u32, bucket_size )
- __field(u64, fragmentation )
__field(int, ret )
),
@@ -897,45 +902,42 @@ TRACE_EVENT(evacuate_bucket,
__entry->bucket = bucket->offset;
__entry->sectors = sectors;
__entry->bucket_size = bucket_size;
- __entry->fragmentation = fragmentation;
__entry->ret = ret;
),
- TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+ TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->member, __entry->bucket,
__entry->sectors, __entry->bucket_size,
- __entry->fragmentation, __entry->ret)
+ __entry->ret)
);
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
- u64 sectors_moved, u64 sectors_not_moved,
- u64 buckets_moved, u64 buckets_not_moved),
- TP_ARGS(c,
- sectors_moved, sectors_not_moved,
- buckets_moved, buckets_not_moved),
+ u64 buckets,
+ u64 sectors_seen,
+ u64 sectors_moved),
+ TP_ARGS(c, buckets, sectors_seen, sectors_moved),
TP_STRUCT__entry(
__field(dev_t, dev )
+ __field(u64, buckets )
+ __field(u64, sectors_seen )
__field(u64, sectors_moved )
- __field(u64, sectors_not_moved )
- __field(u64, buckets_moved )
- __field(u64, buckets_not_moved )
),
TP_fast_assign(
__entry->dev = c->dev;
+ __entry->buckets = buckets;
+ __entry->sectors_seen = sectors_seen;
__entry->sectors_moved = sectors_moved;
- __entry->sectors_not_moved = sectors_not_moved;
- __entry->buckets_moved = buckets_moved;
- __entry->buckets_not_moved = buckets_moved;
),
- TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->sectors_not_moved,
- __entry->buckets_moved, __entry->buckets_not_moved)
+ __entry->buckets,
+ __entry->sectors_seen,
+ __entry->sectors_moved)
);
TRACE_EVENT(copygc_wait,
@@ -1023,10 +1025,33 @@ TRACE_EVENT(trans_restart_split_race,
__entry->u64s_remaining)
);
-DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+TRACE_EVENT(trans_blocked_journal_reclaim,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+
+ __field(unsigned long, key_cache_nr_keys )
+ __field(unsigned long, key_cache_nr_dirty )
+ __field(long, must_wait )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
+ __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
+ __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
+ ),
+
+ TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->key_cache_nr_keys,
+ __entry->key_cache_nr_dirty,
+ __entry->must_wait)
);
TRACE_EVENT(trans_restart_journal_preres_get,
@@ -1323,6 +1348,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1359,10 +1390,21 @@ TRACE_EVENT(path_downgrade,
__entry->pos_snapshot)
);
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+TRACE_EVENT(key_cache_fill,
+ TP_PROTO(struct btree_trans *trans, const char *key),
+ TP_ARGS(trans, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %s", __entry->trans_fn, __get_str(key))
);
TRACE_EVENT(write_buffer_flush,
@@ -1421,6 +1463,24 @@ TRACE_EVENT(write_buffer_flush_slowpath,
TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
+TRACE_EVENT(write_buffer_maybe_flush,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
+ TP_ARGS(trans, caller_ip, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
+);
+
DEFINE_EVENT(fs_str, rebalance_extent,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
@@ -1431,6 +1491,475 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
+#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
+
+TRACE_EVENT(update_by_path,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite),
+ TP_ARGS(trans, path, i, overwrite),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, path_idx )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u8, overwrite )
+ __field(btree_path_idx_t, update_idx )
+ __field(btree_path_idx_t, nr_updates )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->path_idx = path - trans->paths;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->overwrite = overwrite;
+ __entry->update_idx = i - trans->updates;
+ __entry->nr_updates = trans->nr_updates;
+ ),
+
+ TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
+ __entry->trans_fn,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->overwrite,
+ __entry->update_idx,
+ __entry->nr_updates)
+);
+
+TRACE_EVENT(btree_path_lock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_bkey_cached_common *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __array(char, node, 24 )
+ __field(u32, lock_seq )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = b->btree_id;
+ __entry->level = b->level;
+
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->lock_seq = six_lock_seq(&b->lock);
+ ),
+
+ TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->level,
+ __entry->node,
+ __entry->lock_seq)
+);
+
+DECLARE_EVENT_CLASS(btree_path_ev,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(u16, idx )
+ __field(u8, ref )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
+ __entry->idx, __entry->ref,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_alloc,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
+ __entry->idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+TRACE_EVENT(btree_path_get,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, new_idx )
+ __field(u8, btree_id )
+ __field(u8, ref )
+ __field(u8, preserve )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->new_idx = new - trans->paths;
+ __entry->btree_id = path->btree_id;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->new_idx)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DECLARE_EVENT_CLASS(btree_path_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, should_be_locked )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ __entry->locks_want = path->locks_want;
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
+ ),
+
+ TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->trans_fn,
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locks_want,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_set_pos,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
+ ),
+
+ TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+TRACE_EVENT(btree_path_free,
+ TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
+ TP_ARGS(trans, path, dup),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, preserve )
+ __field(u8, should_be_locked)
+ __field(s8, dup )
+ __field(u8, dup_locked )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ __entry->preserve = trans->paths[path].preserve;
+ __entry->should_be_locked = trans->paths[path].should_be_locked;
+ __entry->dup = dup ? dup - trans->paths : -1;
+ __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
+ ),
+
+ TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
+ __entry->preserve ? 'P' : ' ',
+ __entry->should_be_locked ? 'S' : ' ',
+ __entry->dup,
+ __entry->dup_locked)
+);
+
+TRACE_EVENT(btree_path_free_trans_begin,
+ TP_PROTO(btree_path_idx_t path),
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ ),
+
+ TP_printk(" path %3u", __entry->idx)
+);
+
+#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+#ifndef _TRACE_BCACHEFS_H
+
+static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite) {}
+static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
+static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
+static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
+
+#endif
+#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+
+#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
index 905801772002..7f647846b511 100644
--- a/fs/bcachefs/two_state_shared_lock.h
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
{
long i = s ? 1 : -1;
- long v = atomic_long_read(&lock->v), old;
+ long old;
+ old = atomic_long_read(&lock->v);
do {
- old = v;
-
- if (i > 0 ? v < 0 : v > 0)
+ if (i > 0 ? old < 0 : old > 0)
return false;
- } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
- old, old + i)) != old);
+ } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
+
return true;
}
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3a32faa86b5c..da2cd11b3025 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * random utiility code, for bcache but in theory not specific to bcache
+ * random utility code, for bcache but in theory not specific to bcache
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
@@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;
while (p--) {
- if (*res > div_u64(U64_MAX, n))
+ if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
parse_or_ret(cp, parse_unit_suffix(cp, &b));
- if (v > div_u64(U64_MAX, b))
+ if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;
- if (f_n > div_u64(U64_MAX, b))
+ if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;
- f_n = div_u64(f_n * b, f_d);
+ f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@@ -204,7 +204,7 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
-u64 bch2_read_flag_list(char *opt, const char * const list[])
+u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
u64 ret = 0;
char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
@@ -214,7 +214,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
s = strim(d);
- while ((p = strsep(&s, ","))) {
+ while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
if (flag < 0) {
@@ -222,7 +222,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
break;
}
- ret |= 1 << flag;
+ ret |= BIT_ULL(flag);
}
kfree(d);
@@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
+static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
+ bool nonblocking)
{
+ bool locked = false;
const char *p;
if (!lines) {
@@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
return;
}
- console_lock();
+ if (!nonblocking) {
+ console_lock();
+ locked = true;
+ } else {
+ locked = console_trylock();
+ }
+
while (1) {
p = strchrnul(lines, '\n');
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
@@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
break;
lines = p + 1;
}
- console_unlock();
+ if (locked)
+ console_unlock();
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, false);
+}
+
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, true);
}
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
@@ -337,167 +356,23 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
-
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct bch2_quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
-
- if (time_after64(end, start)) {
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
- bch2_quantiles_update(&stats->quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- for (struct bch2_time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- WARN_ONCE(!stats->duration_stats_weighted.weight ||
- !stats->freq_stats_weighted.weight,
- "uninitialized time_stats");
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct bch2_time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct bch2_time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- bch2_time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
+ prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
- prt_tab_rjust(out);
- prt_printf(out, "%s", u->name);
+ prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
}
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
- prt_str(out, name);
- prt_tab(out);
+ prt_printf(out, "%s\t", name);
bch2_pr_time_units_aligned(out, ns);
prt_newline(out);
}
@@ -506,10 +381,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
- const struct time_unit *u;
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
- int i;
+ u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
@@ -531,12 +405,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
}
printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:");
- prt_tab(out);
- prt_printf(out, "%llu ",
- stats->duration_stats.n);
+ prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
printbuf_tabstop_pop(out);
- prt_newline(out);
printbuf_tabstops_reset(out);
@@ -545,13 +415,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 0);
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- prt_tab(out);
- prt_printf(out, "since mount");
- prt_tab_rjust(out);
- prt_tab(out);
- prt_printf(out, "recent");
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\tsince mount\r\trecent\r\n");
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);
@@ -559,88 +423,67 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 2);
printbuf_tabstop_push(out, TABSTOP_SIZE);
- prt_printf(out, "duration of events");
- prt_newline(out);
+ prt_printf(out, "duration of events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
pr_name_and_units(out, "total:", stats->total_duration);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
- prt_printf(out, "time between events");
- prt_newline(out);
+ prt_printf(out, "time between events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_freq);
pr_name_and_units(out, "max:", stats->max_freq);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
- q = max(stats->quantiles.entries[i].m, last_q);
- prt_printf(out, "%llu ",
- div_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
+ if (quantiles) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ bch2_pick_time_units(quantiles->entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(quantiles->entries[i].m, last_q);
+ prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
}
}
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.weight = 8;
- stats->freq_stats_weighted.weight = 8;
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
/* ratelimit: */
@@ -746,40 +589,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
- prt_printf(out, "rate:");
- prt_tab(out);
+ prt_printf(out, "rate:\t");
prt_human_readable_s64(out, pd->rate.rate);
prt_newline(out);
- prt_printf(out, "target:");
- prt_tab(out);
+ prt_printf(out, "target:\t");
prt_human_readable_u64(out, pd->last_target);
prt_newline(out);
- prt_printf(out, "actual:");
- prt_tab(out);
+ prt_printf(out, "actual:\t");
prt_human_readable_u64(out, pd->last_actual);
prt_newline(out);
- prt_printf(out, "proportional:");
- prt_tab(out);
+ prt_printf(out, "proportional:\t");
prt_human_readable_s64(out, pd->last_proportional);
prt_newline(out);
- prt_printf(out, "derivative:");
- prt_tab(out);
+ prt_printf(out, "derivative:\t");
prt_human_readable_s64(out, pd->last_derivative);
prt_newline(out);
- prt_printf(out, "change:");
- prt_tab(out);
+ prt_printf(out, "change:\t");
prt_human_readable_s64(out, pd->last_change);
prt_newline(out);
- prt_printf(out, "next io:");
- prt_tab(out);
- prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
- prt_newline(out);
+ prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
}
/* misc: */
@@ -819,19 +653,25 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
return 0;
}
-size_t bch2_rand_range(size_t max)
+u64 bch2_get_random_u64_below(u64 ceil)
{
- size_t rand;
+ if (ceil <= U32_MAX)
+ return __get_random_u32_below(ceil);
- if (!max)
- return 0;
+ /* this is the same (clever) algorithm as in __get_random_u32_below() */
+ u64 rand = get_random_u64();
+ u64 mult = ceil * rand;
- do {
- rand = get_random_long();
- rand &= roundup_pow_of_two(max) - 1;
- } while (rand >= max);
+ if (unlikely(mult < ceil)) {
+ u64 bound;
+ div64_u64_rem(-ceil, ceil, &bound);
+ while (unlikely(mult < bound)) {
+ rand = get_random_u64();
+ mult = ceil * rand;
+ }
+ }
- return rand;
+ return mul_u64_u64_shr(ceil, rand, 64);
}
void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
@@ -864,171 +704,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return size < PAGE_SIZE
- ? mempool_init_kmalloc_pool(pool, min_nr, size)
- : mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59a5..f4a4783219d9 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -8,6 +8,7 @@
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
+#include <linux/min_heap.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
@@ -21,6 +22,7 @@
#include "mean_and_variance.h"
#include "darray.h"
+#include "time_stats.h"
struct closure;
@@ -53,167 +55,30 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void vpfree(void *p, size_t size)
+static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- free_pages((unsigned long) p, get_order(size));
+ void *p = unlikely(n >= INT_MAX)
+ ? vmalloc(n)
+ : kvmalloc(n, flags & ~__GFP_ZERO);
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, n);
+ return p;
}
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
- if (size < PAGE_SIZE)
- kfree(p);
- else
- vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
- return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
-#define HEAP(type) \
-struct { \
- size_t size, used; \
- type *data; \
-}
-
-#define DECLARE_HEAP(type, name) HEAP(type) name
-
#define init_heap(heap, _size, gfp) \
({ \
- (heap)->used = 0; \
+ (heap)->nr = 0; \
(heap)->size = (_size); \
- (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
- kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
-#define heap_set_backpointer(h, i, _fn) \
-do { \
- void (*fn)(typeof(h), size_t) = _fn; \
- if (fn) \
- fn(h, i); \
-} while (0)
-
-#define heap_swap(h, i, j, set_backpointer) \
-do { \
- swap((h)->data[i], (h)->data[j]); \
- heap_set_backpointer(h, i, set_backpointer); \
- heap_set_backpointer(h, j, set_backpointer); \
-} while (0)
-
-#define heap_peek(h) \
-({ \
- EBUG_ON(!(h)->used); \
- (h)->data[0]; \
-})
-
-#define heap_full(h) ((h)->used == (h)->size)
-
-#define heap_sift_down(h, i, cmp, set_backpointer) \
-do { \
- size_t _c, _j = i; \
- \
- for (; _j * 2 + 1 < (h)->used; _j = _c) { \
- _c = _j * 2 + 1; \
- if (_c + 1 < (h)->used && \
- cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
- _c++; \
- \
- if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
- break; \
- heap_swap(h, _c, _j, set_backpointer); \
- } \
-} while (0)
-
-#define heap_sift_up(h, i, cmp, set_backpointer) \
-do { \
- while (i) { \
- size_t p = (i - 1) / 2; \
- if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
- break; \
- heap_swap(h, i, p, set_backpointer); \
- i = p; \
- } \
-} while (0)
-
-#define __heap_add(h, d, cmp, set_backpointer) \
-({ \
- size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
- heap_set_backpointer(h, _i, set_backpointer); \
- \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- _i; \
-})
-
-#define heap_add(h, d, cmp, set_backpointer) \
-({ \
- bool _r = !heap_full(h); \
- if (_r) \
- __heap_add(h, d, cmp, set_backpointer); \
- _r; \
-})
-
-#define heap_add_or_replace(h, new, cmp, set_backpointer) \
-do { \
- if (!heap_add(h, new, cmp, set_backpointer) && \
- cmp(h, new, heap_peek(h)) >= 0) { \
- (h)->data[0] = new; \
- heap_set_backpointer(h, 0, set_backpointer); \
- heap_sift_down(h, 0, cmp, set_backpointer); \
- } \
-} while (0)
-
-#define heap_del(h, i, cmp, set_backpointer) \
-do { \
- size_t _i = (i); \
- \
- BUG_ON(_i >= (h)->used); \
- (h)->used--; \
- if ((_i) < (h)->used) { \
- heap_swap(h, _i, (h)->used, set_backpointer); \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- heap_sift_down(h, _i, cmp, set_backpointer); \
- } \
-} while (0)
-
-#define heap_pop(h, d, cmp, set_backpointer) \
-({ \
- bool _r = (h)->used; \
- if (_r) { \
- (d) = (h)->data[0]; \
- heap_del(h, 0, cmp, set_backpointer); \
- } \
- _r; \
-})
-
-#define heap_resort(heap, cmp, set_backpointer) \
-do { \
- ssize_t _i; \
- for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
- heap_sift_down(heap, _i, cmp, set_backpointer); \
-} while (0)
-
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
@@ -340,12 +205,13 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
bool bch2_is_zero(const void *, size_t);
-u64 bch2_read_flag_list(char *, const char * const[]);
+u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
@@ -361,84 +227,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
- struct bch2_quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
- unsigned nr;
- struct bch2_time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- struct bch2_quantiles quantiles;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance freq_stats;
- struct mean_and_variance_weighted freq_stats_weighted;
- struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- if (v != !!*start) {
- if (!v) {
- bch2_time_stats_update(stats, *start);
- *start = 0;
- } else {
- *start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- bool ret = v && !*start;
- *start = v;
- return ret;
-}
-#endif
-
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
#define ewma_add(ewma, val, weight) \
({ \
typeof(ewma) _ewma = (ewma); \
@@ -537,6 +327,19 @@ do { \
_ptr ? container_of(_ptr, type, member) : NULL; \
})
+static inline struct list_head *list_pop(struct list_head *head)
+{
+ if (list_empty(head))
+ return NULL;
+
+ struct list_head *ret = head->next;
+ list_del_init(ret);
+ return ret;
+}
+
+#define list_pop_entry(head, type, member) \
+ container_of_or_null(list_pop(head), type, member)
+
/* Does linear interpolation between powers of two */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
@@ -552,11 +355,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
-
#define closure_bio_submit(bio, cl) \
do { \
closure_get(cl); \
@@ -603,7 +401,7 @@ do { \
_ret; \
})
-size_t bch2_rand_range(size_t);
+u64 bch2_get_random_u64_below(u64);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
@@ -738,10 +536,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -788,8 +582,15 @@ static inline void __move_gap(void *array, size_t element_size,
}
/* Move the gap in a gap buffer: */
-#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
- __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+#define move_gap(_d, _new_gap) \
+do { \
+ BUG_ON(_new_gap > (_d)->nr); \
+ BUG_ON((_d)->gap > (_d)->nr); \
+ \
+ __move_gap((_d)->data, sizeof((_d)->data[0]), \
+ (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
+ (_d)->gap = _new_gap; \
+} while (0)
#define bubble_sort(_base, _nr, _cmp) \
do { \
@@ -806,14 +607,19 @@ do { \
} \
} while (0)
+#define per_cpu_sum(_p) \
+({ \
+ typeof(*_p) _ret = 0; \
+ \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ _ret += *per_cpu_ptr(_p, cpu); \
+ _ret; \
+})
+
static inline u64 percpu_u64_get(u64 __percpu *src)
{
- u64 ret = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- ret += *per_cpu_ptr(src, cpu);
- return ret;
+ return per_cpu_sum(src);
}
static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
@@ -827,9 +633,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
{
- unsigned i;
-
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
acc[i] += src[i];
}
@@ -866,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
static inline bool qstr_eq(const struct qstr l, const struct qstr r)
{
return l.len == r.len && !memcmp(l.name, r.name, l.len);
@@ -876,4 +678,52 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+ return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
+static inline void memcpy_swab(void *_dst, void *_src, size_t len)
+{
+ u8 *dst = _dst + len;
+ u8 *src = _src;
+
+ while (len--)
+ *--dst = *src++;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index cb4f33ed9ab3..6620ecae26af 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -3,12 +3,13 @@
#include <linux/bitops.h>
#include <linux/math.h>
#include <linux/string.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#ifdef CONFIG_VALGRIND
#include <valgrind/memcheck.h>
#endif
+#include "errcode.h"
#include "varint.h"
/**
@@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
u64 v;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
__le64 v_le = 0;
@@ -85,7 +86,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
if (likely(bytes < 9)) {
v <<= bytes;
- v |= ~(~0 << (bytes - 1));
+ v |= ~(~0U << (bytes - 1));
} else {
*out++ = 255;
bytes = 9;
@@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
unsigned bytes = ffz(*in) + 1;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
v >>= bytes;
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9c0d2316031b..aed7c6984173 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
-int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
le16_to_cpu(xattr.v->x_val_len));
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
- xattr_val_size_too_small,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
+ c, xattr_val_size_too_small,
"value too small (%zu < %u)",
bkey_val_u64s(k.k), val_u64s);
@@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
val_u64s = xattr_val_u64s(xattr.v->x_name_len,
le16_to_cpu(xattr.v->x_val_len) + 4);
- bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
- xattr_val_size_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
+ c, xattr_val_size_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), val_u64s);
- bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
- xattr_invalid_type,
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
+ c, xattr_invalid_type,
"invalid type (%u)", xattr.v->x_type);
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
- xattr_name_invalid_chars,
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
+ c, xattr_name_invalid_chars,
"xattr name has invalid characters");
fsck_err:
return ret;
@@ -118,11 +117,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
else
prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+ unsigned name_len = xattr.v->x_name_len;
+ unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+ unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+ offsetof(struct bch_xattr, x_name);
+
+ val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+ name_len = min(name_len, max_name_val_bytes);
+
prt_printf(out, "%.*s:%.*s",
- xattr.v->x_name_len,
- xattr.v->x_name,
- le16_to_cpu(xattr.v->x_val_len),
- (char *) xattr_val(xattr.v));
+ name_len, xattr.v->x_name,
+ val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
@@ -138,21 +143,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
- if (ret)
- goto err1;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
if (ret)
- goto err2;
+ return ret;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -160,10 +157,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
-err2:
bch2_trans_iter_exit(trans, &iter);
-err1:
- return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+ return ret;
}
int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
@@ -177,7 +172,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
return ret;
@@ -212,8 +207,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inum, &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
+ (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
} else {
struct xattr_search_key search =
X_SEARCH(type, name, strlen(name));
@@ -255,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix,
return 0;
}
+static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
+{
+ const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
+}
+
static int bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
struct xattr_buf *buf)
{
- const struct xattr_handler *handler =
- bch2_xattr_type_to_handler(xattr->x_type);
+ const char *prefix;
+
+ prefix = bch2_xattr_prefix(xattr->x_type, dentry);
+ if (!prefix)
+ return 0;
- return handler && (!handler->list || handler->list(dentry))
- ? __bch2_xattr_emit(handler->prefix ?: handler->name,
- xattr->x_name, xattr->x_name_len, buf)
- : 0;
+ return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@@ -300,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
- u32 snapshot;
- int ret;
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot),
- POS(inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_xattr)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
- ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- if (ret)
- break;
- }
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ }))) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
-
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- if (ret)
- goto out;
-
- return buf.used;
-out:
- return bch2_err_class(ret);
+ return ret ? bch2_err_class(ret) : buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@@ -356,9 +330,12 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c, NULL, NULL, 0,
+ int ret = bch2_trans_do(c,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+ if (ret < 0 && bch2_err_matches(ret, ENOENT))
+ ret = -ENODATA;
+
return bch2_err_class(ret);
}
@@ -544,11 +521,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
- return ret;
+ goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
- return ret;
+ goto err_class_exit;
s.v = v + 1;
s.defined = true;
@@ -588,13 +565,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
err:
mutex_unlock(&inode->ei_update_lock);
-
- if (value &&
- (opt_id == Opt_background_target ||
- opt_id == Opt_background_compression ||
- (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
- bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
-
+err_class_exit:
return bch2_err_class(ret);
}
@@ -613,20 +584,26 @@ static int bch2_xattr_bcachefs_get_effective(
name, buffer, size, true);
}
+/* Noop - xattrs in the bcachefs_effective namespace are inherited */
+static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return 0;
+}
+
static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
.prefix = "bcachefs_effective.",
.get = bch2_xattr_bcachefs_get_effective,
- .set = bch2_xattr_bcachefs_set,
+ .set = bch2_xattr_bcachefs_set_effective,
};
#endif /* NO_BCACHEFS_FS */
-const struct xattr_handler *bch2_xattr_handlers[] = {
+const struct xattr_handler * const bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- &nop_posix_acl_access,
- &nop_posix_acl_default,
-#endif
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 1337f31a5c49..132fbbd15a66 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -6,12 +6,12 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
- .key_invalid = bch2_xattr_invalid, \
+ .key_validate = bch2_xattr_validate, \
.val_to_text = bch2_xattr_to_text, \
.min_val_size = 8, \
})
@@ -45,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum,
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-extern const struct xattr_handler *bch2_xattr_handlers[];
+extern const struct xattr_handler * const bch2_xattr_handlers[];
#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
index e9f810539552..c7916011ef34 100644
--- a/fs/bcachefs/xattr_format.h
+++ b/fs/bcachefs/xattr_format.h
@@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
- __u8 x_name[];
+ __u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */