Diffstat (limited to 'fs/bcachefs')
214 files changed, 28364 insertions, 17231 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5cdfef3b551a..fc7efd0a7525 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -15,6 +15,7 @@ config BCACHEFS_FS
 	select ZLIB_INFLATE
 	select ZSTD_COMPRESS
 	select ZSTD_DECOMPRESS
+	select CRYPTO
 	select CRYPTO_SHA256
 	select CRYPTO_CHACHA20
 	select CRYPTO_POLY1305
@@ -24,6 +25,7 @@ config BCACHEFS_FS
 	select XXHASH
 	select SRCU
 	select SYMBOLIC_ERRNAME
+	select MIN_HEAP
 	help
 	  The bcachefs filesystem - a modern, copy on write filesystem, with
 	  support for multiple devices, compression, checksumming, etc.
@@ -59,6 +61,13 @@ config BCACHEFS_DEBUG
 	  The resulting code will be significantly slower than normal; you
 	  probably shouldn't select this option unless you're a developer.
 
+config BCACHEFS_INJECT_TRANSACTION_RESTARTS
+	bool "Randomly inject transaction restarts"
+	depends on BCACHEFS_DEBUG
+	help
+	  Randomly inject transaction restarts in a few core paths - may have a
+	  significant performance penalty
+
 config BCACHEFS_TESTS
 	bool "bcachefs unit and performance tests"
 	depends on BCACHEFS_FS
@@ -87,6 +96,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
 	  is held by another thread, spin for a short while, as long as the
 	  thread owning the lock is running.
 
+config BCACHEFS_PATH_TRACEPOINTS
+	bool "Extra btree_path tracepoints"
+	depends on BCACHEFS_FS && TRACING
+	help
+	  Enable extra tracepoints for debugging btree_path operations; we don't
+	  normally want these enabled because they happen at very high rates.
+
 config MEAN_AND_VARIANCE_UNIT_TEST
 	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..d2689388d5e8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y :=		\
 	btree_journal_iter.o	\
 	btree_key_cache.o	\
 	btree_locking.o		\
+	btree_node_scan.o	\
 	btree_trans_commit.o	\
 	btree_update.o		\
 	btree_update_interior.o	\
@@ -28,15 +29,17 @@ bcachefs-y :=		\
 	clock.o			\
 	compress.o		\
 	darray.o		\
+	data_update.o		\
 	debug.o			\
 	dirent.o		\
+	disk_accounting.o	\
 	disk_groups.o		\
-	data_update.o		\
 	ec.o			\
 	errcode.o		\
 	error.o			\
 	extents.o		\
 	extent_update.o		\
+	eytzinger.o		\
 	fs.o			\
 	fs-common.o		\
 	fs-ioctl.o		\
@@ -66,7 +69,9 @@ bcachefs-y :=		\
 	printbuf.o		\
 	quota.o			\
 	rebalance.o		\
+	rcu_pending.o		\
 	recovery.o		\
+	recovery_passes.o	\
 	reflink.o		\
 	replicas.o		\
 	sb-clean.o		\
@@ -77,11 +82,13 @@ bcachefs-y :=		\
 	siphash.o		\
 	six.o			\
 	snapshot.o		\
+	str_hash.o		\
 	subvolume.o		\
 	super.o			\
 	super-io.o		\
 	sysfs.o			\
 	tests.o			\
+	time_stats.o		\
 	thread_with_file.o	\
 	trace.o			\
 	two_state_shared_lock.o	\
@@ -90,3 +97,6 @@ bcachefs-y :=		\
 	xattr.o
 
 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..99487727ae64 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -184,11 +184,6 @@ invalid:
 	return ERR_PTR(-EINVAL);
 }
 
-#define acl_for_each_entry(acl, acl_e)				\
-	for (acl_e = acl->a_entries;				\
-	     acl_e < acl->a_entries + acl->a_count;		\
-	     acl_e++)
-
 /*
  * Convert from in-memory to filesystem representation.
*/ @@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, { struct bkey_i_xattr *xattr; bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e; + const struct posix_acl_entry *acl_e, *pe; void *outptr; unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { switch (acl_e->e_tag) { case ACL_USER: case ACL_GROUP: @@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, outptr = (void *) acl_header + sizeof(*acl_header); - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { bch_acl_entry *entry = outptr; entry->e_tag = cpu_to_le16(acl_e->e_tag); @@ -272,46 +267,41 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; + + if (rcu) + return ERR_PTR(-ECHILD); + + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - if (ret) { - if (!bch2_err_matches(ret, ENOENT)) - acl = ERR_PTR(ret); - goto out; - } - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) { - acl = ERR_PTR(ret); - goto out; - } + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + int ret = bkey_err(k); + if (ret) + goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) + acl = !bch2_err_matches(ret, ENOENT) ? 
ERR_PTR(ret) : NULL; - if (!IS_ERR(acl)) + if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); -out: - if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) - goto retry; bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); @@ -354,7 +344,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -362,13 +351,14 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); acl = _acl; - ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -402,8 +392,8 @@ btree_err: set_cached_acl(&inode->v, type, acl); err: - mutex_unlock(&inode->ei_update_lock); bch2_trans_put(trans); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -416,39 +406,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_i_xattr *new; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (IS_ERR_OR_NULL(acl)) + if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, - __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; - new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - if (IS_ERR(new)) { - ret = PTR_ERR(new); + struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + ret = PTR_ERR_OR_ZERO(new); + if (ret) goto err; - } new->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &new->k_i, 0); diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 27e7eec0f278..fe730a6bf0c1 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t); #ifdef CONFIG_BCACHEFS_POSIX_ACL -struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3e175d8342..3ea809990ef1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -14,6 +15,7 @@ #include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "lru.h" @@ -28,6 +30,9 @@ #include <linux/rcupdate.h> #include <linux/sched/task.h> #include <linux/sort.h> +#include <linux/jiffies.h> + +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -192,99 +197,119 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, - alloc_v1_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), + c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } -int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { 
struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + struct bch_alloc_v4 a; int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, - alloc_v4_val_size_bad, + bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); + + bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), + c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, - alloc_v4_backpointers_start_bad, + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && + BCH_ALLOC_V4_NR_BACKPOINTERS(&a), + c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, - alloc_key_data_type_bad, + bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, + c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + a.data_type, alloc_data_type(a, a.data_type)); + + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, + c, alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? "read" : "write", + a.io_time[i], LRU_TIME_MAX); + + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > + offsetof(struct bch_alloc_v4, stripe_sectors) + ? 
a.stripe_sectors + : 0; - switch (a.v->data_type) { + switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, - c, err, alloc_key_empty_but_have_data, - "empty data type free but have data"); + bkey_fsck_err_on(stripe_sectors || + a.dirty_sectors || + a.cached_sectors || + a.stripe, + c, alloc_key_empty_but_have_data, + "empty data type free but have data %u.%u.%u %u", + stripe_sectors, + a.dirty_sectors, + a.cached_sectors, + a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), - c, err, alloc_key_dirty_sectors_0, + bkey_fsck_err_on(!a.dirty_sectors && + !stripe_sectors, + c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.v->data_type)); + bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: - bkey_fsck_err_on(!a.v->cached_sectors || - bch2_bucket_sectors_dirty(*a.v) || - a.v->stripe, - c, err, alloc_key_cached_inconsistency, + bkey_fsck_err_on(!a.cached_sectors || + a.dirty_sectors || + stripe_sectors || + a.stripe, + c, alloc_key_cached_inconsistency, "data type inconsistency"); - bkey_fsck_err_on(!a.v->io_time[READ] && + bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, alloc_key_cached_but_read_time_zero, + c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -297,9 +322,9 @@ fsck_err: void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - struct bch_backpointer *bp, *bps; - a->journal_seq = swab64(a->journal_seq); + a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); + a->journal_seq_empty = swab64(a->journal_seq_empty); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); @@ -307,20 +332,14 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); - a->fragmentation_lru = swab64(a->fragmentation_lru); - - bps = alloc_v4_backpointers(a); - for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { - bp->bucket_offset = swab40(bp->bucket_offset); - bp->bucket_len = swab32(bp->bucket_len); - bch2_bpos_swab(&bp->pos); - } + a->stripe_sectors = swab32(a->stripe_sectors); } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + struct bch_dev *ca = c ? 
bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -328,28 +347,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu", a->journal_seq); - prt_newline(out); - prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_newline(out); - prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_newline(out); - prt_printf(out, "dirty_sectors %u", a->dirty_sectors); - prt_newline(out); - prt_printf(out, "cached_sectors %u", a->cached_sectors); - prt_newline(out); - prt_printf(out, "stripe %u", a->stripe); - prt_newline(out); - prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); - prt_newline(out); - prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); - prt_newline(out); - prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); - prt_newline(out); - prt_printf(out, "fragmentation %llu", a->fragmentation_lru); - prt_newline(out); - prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + + if (ca) + prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); + prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); + + bch2_dev_put(ca); } void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) @@ -371,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, + .journal_seq_nonempty = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, @@ -437,22 +452,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) +bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - int ret; - - k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); - a = bch2_alloc_to_v4_mut_inlined(trans, k); + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) goto err; @@ -462,6 +473,21 @@ err: return ERR_PTR(ret); } +__flatten +struct bkey_i_alloc_v4 
*bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, + enum btree_iter_update_trigger_flags flags) +{ + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ERR_PTR(ret); + + ret = bch2_trans_update(trans, &iter, &a->k_i, flags); + bch2_trans_iter_exit(trans, &iter); + return unlikely(ret) ? ERR_PTR(ret) : a; +} + static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -484,14 +510,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, - bucket_gens_val_size_bad, + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), + c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: @@ -518,7 +543,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -530,13 +555,13 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen = bch2_alloc_to_v4(k, &a)->gen; unsigned offset; struct bpos pos = alloc_gens_pos(iter.pos, &offset); + int ret2 = 0; - if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - if (ret) - break; + if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { + ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret2) + goto iter_err; have_bucket_gens_key = false; } @@ -547,7 +572,8 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - 0; +iter_err: + ret2; })); if (have_bucket_gens_key && !ret) @@ -564,29 +590,29 @@ int bch2_bucket_gens_init(struct bch_fs *c) int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); + struct bch_dev *ca = NULL; int ret; - down_read(&c->gc_lock); - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_exists2(c, k.k->p.inode)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); @@ -596,15 +622,26 @@ int bch2_alloc_read(struct bch_fs 
*c) })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_bucket_exists(c, k.k->p)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + + if (k.k->p.offset < ca->mi.first_bucket) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (k.k->p.offset >= ca->mi.nbuckets) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; @@ -612,8 +649,8 @@ int bch2_alloc_read(struct bch_fs *c) })); } + bch2_dev_put(ca); bch2_trans_put(trans); - up_read(&c->gc_lock); bch_err_fn(c, ret); return ret; @@ -621,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ +static int __need_discard_or_freespace_err(struct btree_trans *trans, + struct bkey_s_c alloc_k, + bool set, bool discard, bool repair) +{ + struct bch_fs *c = trans->c; + enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); + enum bch_sb_error_id err_id = discard + ? BCH_FSCK_ERR_need_discard_key_wrong + : BCH_FSCK_ERR_freespace_key_wrong; + enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, alloc_k); + + int ret = __bch2_fsck_err(NULL, trans, flags, err_id, + "bucket incorrectly %sset in %s btree\n" + " %s", + set ? "" : "un", + bch2_btree_id_str(btree), + buf.buf); + if (ret == -BCH_ERR_fsck_ignore || + ret == -BCH_ERR_fsck_errors_not_fixed) + ret = 0; + + printbuf_exit(&buf); + return ret; +} + +#define need_discard_or_freespace_err(...) \ + fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) + +#define need_discard_or_freespace_err_on(cond, ...) \ + (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) + static int bch2_bucket_do_index(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); - struct btree_iter iter; - struct bkey_s_c old; - struct bkey_i *k; enum btree_id btree; - enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; - struct printbuf buf = PRINTBUF; - int ret; + struct bpos pos; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.type = new_type; - switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; - k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); - bch2_key_resize(&k->k, 1); + pos = alloc_freespace_pos(alloc_k.k->p, *a); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; - k->k.p = alloc_k.k->p; + pos = alloc_k.k->p; break; default: return 0; } - old = bch2_bkey_get_iter(trans, &iter, btree, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - ret = bkey_err(old); + struct btree_iter iter; + struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); + int ret = bkey_err(old); if (ret) return ret; - if (ca->mi.freespace_initialized && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && - bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" - " for %s", - set ? "setting" : "clearing", - bch2_btree_id_str(btree), - iter.pos.inode, - iter.pos.offset, - bch2_bkey_types[old.k->type], - bch2_bkey_types[old_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = -EIO; - goto err; - } + need_discard_or_freespace_err_on(ca->mi.freespace_initialized && + !old.k->type != set, + trans, alloc_k, set, + btree == BTREE_ID_need_discard, false); - ret = bch2_trans_update(trans, &iter, k, 0); -err: + ret = bch2_btree_bit_mod_iter(trans, &iter, set); +fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -708,8 +751,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) return ret; @@ -728,31 +771,99 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } +static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, + enum bch_data_type data_type, + s64 delta_buckets, + s64 delta_sectors, + s64 delta_fragmented, unsigned flags) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = data_type, + }; + s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; + + return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); +} + +int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + unsigned flags) +{ + s64 old_sectors = bch2_bucket_sectors(*old); + s64 new_sectors = bch2_bucket_sectors(*new); + if (old->data_type != new->data_type) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: + bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, + -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } else if (old_sectors != new_sectors) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 0, + new_sectors - old_sectors, + bch2_bucket_sectors_fragmented(ca, *new) - + 
bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } + + s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); + s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); + if (old_unstriped != new_unstriped) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, + !!new_unstriped - !!old_unstriped, + new_unstriped - old_unstriped, + 0, + flags); + if (ret) + return ret; + } + + return 0; +} + int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) + struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); + if (!ca) return -EIO; - struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + struct bch_alloc_v4 *new_a; + if (likely(new.k->type == KEY_TYPE_alloc_v4)) { + new_a = bkey_s_to_alloc_v4(new).v; + } else { + BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); + + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) + goto err; + new_a = &new_ka->v; + } + + if (flags & BTREE_TRIGGER_transactional) { + alloc_data_type_set(new_a, new_a->data_type); - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); - if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + if (is_empty_delta < 0) { + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -762,20 +873,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, new.s_c, new_a, true); + ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: + bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if (ret) - return ret; + goto err; } if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -784,128 +896,150 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) - return ret; + goto err; } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new.k->p.inode)); - if (old_a->fragmentation_lru != 
new_a->fragmentation_lru) { + old_lru = alloc_lru_idx_fragmentation(*old_a, ca); + new_lru = alloc_lru_idx_fragmentation(*new_a, ca); + if (old_lru != new_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), - old_a->fragmentation_lru, new_a->fragmentation_lru); + old_lru, new_lru); if (ret) - return ret; + goto err; } if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) - return ret; + goto err; } - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { - ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, - -((s64) old_a->cached_sectors)); + ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, + -((s64) old_a->cached_sectors), + flags & BTREE_TRIGGER_gc); if (ret) - return ret; + goto err; } + + ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); + if (ret) + goto err; } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq = new_a->journal_seq; + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { + u64 transaction_seq = trans->journal_res.seq; + BUG_ON(!transaction_seq); - if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, + trans, alloc_key_journal_seq_in_future, + "bucket journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) + new_a->journal_seq_nonempty = transaction_seq; - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 
0 : journal_seq; + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + /* + * Record journal sequence number of empty -> nonempty transition: + * Note that there may be multiple empty -> nonempty + * transitions, data in a bucket may be overwritten while we're + * still writing to it - so be careful to only record the first: + * */ + if (is_empty_delta < 0 && + new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { + new_a->journal_seq_nonempty = transaction_seq; + new_a->journal_seq_empty = 0; } - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "error setting bucket_needs_journal_commit: %i", ret); - return ret; + /* + * Bucket becomes empty: mark it as waiting for a journal flush, + * unless updates since empty -> nonempty transition were never + * flushed - we may need to ask the journal not to flush + * intermediate sequence numbers: + */ + if (is_empty_delta > 0) { + if (new_a->journal_seq_nonempty == transaction_seq || + bch2_journal_noflush_seq(&c->journal, + new_a->journal_seq_nonempty, + transaction_seq)) { + new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; + } else { + new_a->journal_seq_empty = transaction_seq; + + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) + goto err; } } - percpu_down_read(&c->mark_lock); - if (new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; + if (new_a->gen != old_a->gen) { + rcu_read_lock(); + u8 *gen = bucket_gen(ca, new.k->p.offset); + if (unlikely(!gen)) { + rcu_read_unlock(); + goto invalid_bucket; + } + *gen = new_a->gen; + rcu_read_unlock(); + } - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); +#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); + if (statechange(a->data_type == BCH_DATA_need_discard) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && + bucket_flushed(new_a)) + bch2_discard_one_bucket_fast(ca, new.k->p.offset); - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && + if (statechange(a->data_type == BCH_DATA_cached) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); - if (new_a->data_type == BCH_DATA_need_gc_gens) - bch2_do_gc_gens(c); - percpu_up_read(&c->mark_lock); + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) + bch2_gc_gens_async(c); } - if ((flags & BTREE_TRIGGER_GC) && - 
(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { - struct bch_alloc_v4 new_a_convert; - const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); - - percpu_down_read(&c->mark_lock); + if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { + rcu_read_lock(); struct bucket *g = gc_bucket(ca, new.k->p.offset); - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - percpu_up_read(&c->mark_lock); + if (unlikely(!g)) { + rcu_read_unlock(); + goto invalid_bucket; + } + g->gen_valid = 1; + g->gen = new_a->gen; + rcu_read_unlock(); } - - return 0; +err: +fsck_err: + printbuf_exit(&buf); + bch2_dev_put(ca); + return ret; +invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); + ret = -EIO; + goto err; } /* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) @@ -933,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -950,35 +1084,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos } } -static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) { - struct bch_dev *ca; - - if (bch2_dev_bucket_exists(c, *bucket)) - return true; + if (*ca) { + if (bucket->offset < (*ca)->mi.first_bucket) + bucket->offset = (*ca)->mi.first_bucket; - if (bch2_dev_exists2(c, bucket->inode)) { - ca = bch_dev_bkey_exists(c, bucket->inode); - - if (bucket->offset < ca->mi.first_bucket) { - bucket->offset = ca->mi.first_bucket; + if (bucket->offset < (*ca)->mi.nbuckets) return true; - } + bch2_dev_put(*ca); + *ca = NULL; bucket->inode++; bucket->offset = 0; } rcu_read_lock(); - ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (ca) - *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); + if (*ca) { + *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); + bch2_dev_get(*ca); + } rcu_read_unlock(); - return ca != NULL; + return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; @@ -987,22 +1120,21 @@ again: if (bkey_err(k)) return k; + *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); + if (!k.k->type) { - struct bpos bucket = bkey_start_pos(k.k); + struct bpos hole_start = bkey_start_pos(k.k); - if (!bch2_dev_bucket_exists(c, bucket)) { - if (!next_bucket(c, &bucket)) + if (!*ca || !bucket_valid(*ca, hole_start.offset)) { + if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bucket); + 
bch2_btree_iter_set_pos(iter, hole_start); goto again; } - if (!bch2_dev_bucket_exists(c, k.k->p)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - - bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); - } + if (k.k->p.offset > (*ca)->mi.nbuckets) + bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); } return k; @@ -1017,87 +1149,51 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; + int ret = 0; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, - alloc_key_to_missing_dev_bucket, + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); + if (fsck_err_on(!ca, + trans, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) - return bch2_btree_delete_at(trans, alloc_iter, 0); + ret = bch2_btree_delete_at(trans, alloc_iter, 0); + if (!ca) + return ret; - ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); if (!ca->mi.freespace_initialized) - return 0; + goto out; a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; - if (k.k->type != discard_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = discard_key_type; - update->k.p = discard_iter->pos; - - ret = bch2_trans_update(trans, discard_iter, update, 0); + bool is_discarded = a->data_type == BCH_DATA_need_discard; + if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, + trans, alloc_k, !is_discarded, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); if (ret) goto err; } - freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; - if (k.k->type != freespace_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = freespace_key_type; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, 1); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); + bool is_free = a->data_type == BCH_DATA_free; + if (need_discard_or_freespace_err_on(!!k.k->type != is_free, + trans, alloc_k, !is_free, false, true)) { + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); if (ret) goto err; } @@ -1108,14 +1204,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (a->gen != alloc_gen(k, gens_offset) && - (c->opts.reconstruct_alloc || - fsck_err(c, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + trans, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); @@ -1136,25 +1231,25 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } +out: err: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bch_dev *ca, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) { - struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ca = bch_dev_bkey_exists(c, start.inode); if (!ca->mi.freespace_initialized) return 0; @@ -1167,14 +1262,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); - if (k.k->type != KEY_TYPE_set && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - " device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset))) { + if (fsck_err_on(k.k->type != KEY_TYPE_set, + trans, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1205,7 +1299,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct bpos *end, struct btree_iter *bucket_gens_iter) { - struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; unsigned i, gens_offset, gens_end_offset; @@ -1229,7 +1322,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, bkey_reassemble(&g.k_i, k); for (i = gens_offset; i < gens_end_offset; i++) { - if 
(fsck_err_on(g.v.gens[i], c, + if (fsck_err_on(g.v.gens[i], trans, bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, @@ -1262,63 +1355,129 @@ fsck_err: return ret; } -static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +struct check_discard_freespace_key_async { + struct work_struct work; + struct bch_fs *c; + struct bbpos pos; +}; + +static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + u8 gen; + ret = k.k->type != KEY_TYPE_set + ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) + : 0; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void check_discard_freespace_key_work(struct work_struct *work) +{ + struct check_discard_freespace_key_async *w = + container_of(work, struct check_discard_freespace_key_async, work); + + bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); + bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + kfree(w); +} + +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + bool async_repair) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 genbits; - struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; - int ret; - pos = iter->pos; - pos.offset &= ~(~0ULL << 56); - genbits = iter->pos.offset & (~0ULL << 56); + struct bpos bucket = iter->pos; + bucket.offset &= ~(~0ULL << 56); + u64 genbits = iter->pos.offset & (~0ULL << 56); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - ret = bkey_err(alloc_k); + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + async_repair ? 
BTREE_ITER_cached : 0); + int ret = bkey_err(alloc_k); if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, - need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) - goto delete; + if (!bch2_dev_bucket_exists(c, bucket)) { + if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + goto delete; + ret = 1; + goto out; + } - a = bch2_alloc_to_v4(alloc_k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a))) { + if (fsck_err(trans, need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; + ret = 1; + goto out; + } - if (fsck_err_on(a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), c, - need_discard_freespace_key_bad, - "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; + *gen = a->gen; out: fsck_err: - set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - goto out; + if (!async_repair) { + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + goto out; + } else { + /* + * We can't repair here when called from the allocator path: the + * commit will recurse back into the allocator + */ + struct check_discard_freespace_key_async *w = + kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) + goto out; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + kfree(w); + goto out; + } + + INIT_WORK(&w->work, check_discard_freespace_key_work); + w->c = c; + w->pos = BBPOS(iter->btree_id, iter->pos); + queue_work(c->write_ref_wq, &w->work); + goto out; + } +} + +static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) +{ + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); + return ret < 0 ? 
ret : 0; } /* @@ -1333,33 +1492,28 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; - struct bch_dev *ca; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false, dev_exists; + bool need_update = false; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - /* if no bch_dev, skip out whether we repair or not */ - dev_exists = bch2_dev_exists2(c, k.k->p.inode); - if (!dev_exists) { - if (fsck_err_on(!dev_exists, c, - bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); + if (!ca) { + if (fsck_err(trans, bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); - } goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, c, - bucket_gens_to_invalid_buckets, + start >= ca->mi.nbuckets, + trans, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1367,16 +1521,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1394,6 +1548,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } out: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } @@ -1402,25 +1557,26 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { struct bpos next; bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1441,7 +1597,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(trans, + ret = 
bch2_check_alloc_hole_freespace(trans, ca, bkey_start_pos(k.k), &next, &freespace_iter) ?: @@ -1469,19 +1625,21 @@ bkey_err: bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); + bch2_dev_put(ca); + ca = NULL; if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter)); + BTREE_ITER_prefetch, k, + bch2_check_discard_freespace_key_fsck(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -1489,7 +1647,7 @@ bkey_err: break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key(trans, &iter); + bch2_check_discard_freespace_key_fsck(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; @@ -1511,7 +1669,7 @@ bkey_err: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: @@ -1521,13 +1679,13 @@ err: } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, lru_k; + struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; @@ -1539,13 +1697,25 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; + struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); + if (!ca) + return 0; + a = bch2_alloc_to_v4(alloc_k, &a_convert); + u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); + if (lru_idx) { + ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + lru_idx, alloc_k, last_flushed); + if (ret) + goto err; + } + if (a->data_type != BCH_DATA_cached) - return 0; + goto err; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, + if (fsck_err_on(!a->io_time[READ], + trans, alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), @@ -1556,119 +1726,117 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); + &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) goto err; a = &a_mut->v; } - lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]), 0); - ret = bkey_err(lru_k); + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + alloc_k, last_flushed); if (ret) - return ret; - - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, - alloc_key_to_missing_lru_entry, - "missing lru entry\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_lru_set(trans, - alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]); - if (ret) - goto err; - } + goto err; err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } int 
bch2_check_alloc_to_lru_refs(struct bch_fs *c) { + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) +{ + int ret; + + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + ret = -BCH_ERR_EEXIST_discard_in_flight_add; + goto out; + } + + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); +out: + mutex_unlock(&ca->discard_buckets_in_flight_lock); + return ret; +} + +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) +{ + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); + goto found; + } + BUG(); +found: + mutex_unlock(&ca->discard_buckets_in_flight_lock); +} + struct discard_buckets_state { u64 seen; u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev *ca; - u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->ref); - if (ca) - percpu_ref_get(&ca->ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - struct discard_buckets_state *s) + struct discard_buckets_state *s, + bool fastpath) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; - struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; + bool discard_locked = false; int ret = 0; - ca = bch_dev_bkey_exists(c, pos.inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - pos.inode, pos.offset)) { - s->need_journal_commit++; - s->need_journal_commit_this_dev++; + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + pos.inode, pos.offset); + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; goto out; } k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto out; @@ -1678,78 +1846,73 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - 
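The discard_in_flight_add()/discard_in_flight_remove() pair above turns a mutex-protected darray into a small dedupe set, so a bucket handed to the discard fastpath can never be in flight twice at once. A rough userspace model of that pattern, with a pthread mutex and a fixed array standing in for the darray (names and sizes here are made up, not the bcachefs API):

#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_IN_FLIGHT 16

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t in_flight[MAX_IN_FLIGHT];
static unsigned nr_in_flight;

/* Returns 0 if @bucket was added, -EEXIST if it is already being discarded. */
static int in_flight_add(uint64_t bucket)
{
    int ret = 0;

    pthread_mutex_lock(&lock);
    for (unsigned i = 0; i < nr_in_flight; i++)
        if (in_flight[i] == bucket) {
            ret = -EEXIST;
            goto out;
        }
    in_flight[nr_in_flight++] = bucket;     /* no capacity check in this toy */
out:
    pthread_mutex_unlock(&lock);
    return ret;
}

static void in_flight_remove(uint64_t bucket)
{
    pthread_mutex_lock(&lock);
    for (unsigned i = 0; i < nr_in_flight; i++)
        if (in_flight[i] == bucket) {
            in_flight[i] = in_flight[--nr_in_flight];
            break;
        }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    int a = in_flight_add(42);
    int b = in_flight_add(42);

    printf("%d %d\n", a, b);            /* 0 -17 */
    in_flight_remove(42);
    printf("%d\n", in_flight_add(42));  /* 0 again */
    return 0;
}

A duplicate add simply refuses, so whichever path queued the bucket first owns the discard.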
a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; + if (a->v.data_type != BCH_DATA_need_discard) { + if (need_discard_or_freespace_err(trans, k, true, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); + if (ret) + goto out; + goto commit; } + goto out; } - if (a->v.data_type != BCH_DATA_need_discard) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - } + if (!fastpath) { + if (discard_in_flight_add(ca, iter.pos.offset, true)) + goto out; - goto out; + discard_locked = true; } - if (!bkey_eq(*discard_pos_done, iter.pos) && - ca->mi.discard && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); + if (!bkey_eq(*discard_pos_done, iter.pos)) { + s->discarded++; *discard_pos_done = iter.pos; - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; + if (ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock_long(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); -write: - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + alloc_data_type_set(&a->v, a->v.data_type); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; +commit: + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); - s->discarded++; out: - s->seen++; +fsck_err: + if (discard_locked) + discard_in_flight_remove(ca, iter.pos.offset); + if (!ret) + s->seen++; bch2_trans_iter_exit(trans, &iter); - percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1760,23 +1923,136 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); + for_each_btree_key_max(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, 
&discard_pos_done, &s, false))); - discard_buckets_next_dev(c, &s, NULL); + if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) + bch2_journal_flush_async(&c->journal, NULL); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_write_ref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + percpu_ref_put(&ca->io_ref); +put_write_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); +} + +static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bch_dev *ca, + u64 bucket, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) +{ + struct btree_iter need_discard_iter; + struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, + BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + int ret = bkey_err(discard_k); + if (ret) + return ret; + + if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) + goto out; + + ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +out: +fsck_err: + bch2_trans_iter_exit(trans, &need_discard_iter); + return ret; +} + +static void bch2_do_discards_fast_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; + struct discard_buckets_state s = {}; + struct bpos discard_pos_done = POS_MAX; + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; + + while (1) { + bool got_bucket = false; + u64 bucket; + + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) + continue; + + got_bucket = true; + bucket = i->bucket; + i->in_progress = true; + break; + } + mutex_unlock(&ca->discard_buckets_in_flight_lock); + + if (!got_bucket) + break; + + ret = lockrestart_do(trans, + bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); + bch_err_fn(c, ret); + + discard_in_flight_remove(ca, bucket); + + if (ret) + break; + } + + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); + percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) +{ + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + + percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -1785,7 +2061,6 @@ static int invalidate_one_bucket(struct btree_trans *trans, s64 
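bch2_dev_do_discards() and bch2_discard_one_bucket_fast() above both queue per-device work with the same ladder: take the filesystem write ref, then the device io ref, then queue the work item, unwinding in reverse order from whichever step fails; the work function drops both refs when it finishes. A condensed, self-contained sketch of that ownership hand-off, with plain counters standing in for the real refcounts and workqueue (an illustration only, not the bcachefs API):

#include <stdbool.h>
#include <stdio.h>

struct dev {
    int write_refs;
    int io_refs;
    bool work_queued;
};

static bool write_ref_tryget(struct dev *d) { d->write_refs++; return true; }
static void write_ref_put(struct dev *d)    { d->write_refs--; }
static bool io_ref_tryget(struct dev *d)    { d->io_refs++; return true; }
static void io_ref_put(struct dev *d)       { d->io_refs--; }

/* Stand-in for queue_work(): false means the work item was already pending. */
static bool queue_work_stub(struct dev *d)
{
    if (d->work_queued)
        return false;
    d->work_queued = true;
    return true;
}

static void dev_do_discards(struct dev *d)
{
    if (!write_ref_tryget(d))
        return;
    if (!io_ref_tryget(d))
        goto put_write_ref;
    if (queue_work_stub(d))
        return;                 /* the work item now owns both refs */

    io_ref_put(d);
put_write_ref:
    write_ref_put(d);
}

/* The work function releases the refs it inherited once it is done. */
static void discard_work(struct dev *d)
{
    d->work_queued = false;
    io_ref_put(d);
    write_ref_put(d);
}

int main(void)
{
    struct dev d = { 0 };

    dev_do_discards(&d);        /* queues the work, refs stay held */
    dev_do_discards(&d);        /* already queued: both refs unwound */
    discard_work(&d);
    printf("write %d io %d\n", d.write_refs, d.io_refs);    /* write 0 io 0 */
    return 0;
}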
*nr_to_invalidate) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); @@ -1796,14 +2071,17 @@ static int invalidate_one_bucket(struct btree_trans *trans, return 1; if (!bch2_dev_bucket_exists(c, bucket)) { - prt_str(&buf, "lru entry points to invalid bucket"); - goto err; + if (fsck_err(trans, lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + bucket.inode, bucket.offset)) + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + goto out; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1813,6 +2091,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); @@ -1823,49 +2102,44 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.gen++; a->v.data_type = 0; a->v.dirty_sectors = 0; + a->v.stripe_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - - ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); + + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: - bch2_trans_iter_exit(trans, &alloc_iter); +fsck_err: printbuf_exit(&buf); return ret; -err: - prt_str(&buf, "\n lru key: "); - bch2_bkey_val_to_text(&buf, c, lru_k); - - prt_str(&buf, "\n lru entry: "); - bch2_lru_pos_to_text(&buf, lru_iter->pos); - - prt_str(&buf, "\n alloc key: "); - if (!a) - bch2_bpos_to_text(&buf, bucket); - else - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - - bch_err(c, "%s", buf.buf); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { - bch2_inconsistent_error(c); - ret = -EINVAL; +} + +static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, + struct bch_dev *ca, bool *wrapped) +{ + struct bkey_s_c k; +again: + k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; } - goto out; + return k; } static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -1873,31 +2147,64 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + 
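next_lru_key() above makes the invalidate scan circular: it peeks from the current cursor to the end of the device's LRU range, and if nothing is found it wraps to lru_pos(ca->dev_idx, 0, 0) exactly once. A runnable toy version of a single-wrap scan over a sorted array, modelling only the wrap (the real loop is bounded by nr_to_invalidate rather than by comparing against the start position):

#include <stdbool.h>
#include <stdio.h>

/*
 * Index of the first element >= start, wrapping to the beginning of the
 * array at most once; -1 when nothing is left to visit.
 */
static int next_idx(const unsigned *keys, unsigned nr, unsigned start, bool *wrapped)
{
    for (;;) {
        for (unsigned i = 0; i < nr; i++)
            if (keys[i] >= start)
                return i;
        if (*wrapped)
            return -1;
        start = 0;
        *wrapped = true;
    }
}

int main(void)
{
    const unsigned lru[] = { 10, 20, 30, 40 };
    const unsigned start = 25;
    bool wrapped = false;
    unsigned pos = start;
    int i;

    /* Visits 30, 40, then wraps and visits 10, 20 - each entry once. */
    while ((i = next_idx(lru, 4, pos, &wrapped)) >= 0) {
        if (wrapped && lru[i] >= start)
            break;              /* back where we started */
        printf("%u\n", lru[i]);
        pos = lru[i] + 1;
    }
    return 0;
}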
struct btree_iter iter; + bool wrapped = false; - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, 0), - lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_INTENT, k, - invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - if (ret < 0) { - percpu_ref_put(&ca->ref); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (ret) + goto restart_err; + if (!k.k) break; - } + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); +restart_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); + percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + percpu_ref_put(&ca->io_ref); +put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -1917,13 +2224,13 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { - if (last_updated + HZ * 10 < jiffies) { + if (time_after(jiffies, last_updated + HZ * 10)) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); last_updated = jiffies; @@ -1949,7 +2256,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(trans, k, a, true) ?: + ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ -2021,7 +2328,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } @@ -2037,23 +2344,51 @@ int bch2_fs_freespace_init(struct bch_fs *c) return 0; } +/* device removal */ + +int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: + bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + 
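The bch2_dev_freespace_init() hunk above replaces the open-coded "last_updated + HZ * 10 < jiffies" test with time_after(), which compares through a signed difference and so keeps giving the right answer when the tick counter wraps. A standalone demonstration with a 32-bit counter; ticks_after() reimplements the same trick purely for illustration and is not the kernel macro:

#include <stdint.h>
#include <stdio.h>

/* Same idea as the kernel's time_after(a, b): true if a is after b. */
static int ticks_after(uint32_t a, uint32_t b)
{
    return (int32_t)(b - a) < 0;
}

int main(void)
{
    uint32_t last = UINT32_MAX - 5;     /* timestamp taken just before the wrap */
    uint32_t now  = last + 100;         /* wraps to 94, but is later in real time */

    printf("naive now > last: %d\n", now > last);               /* 0 - wrong   */
    printf("ticks_after(now): %d\n", ticks_after(now, last));   /* 1 - correct */
    return 0;
}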
BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_dev_usage_remove(c, ca->dev_idx); + bch_err_msg(ca, ret, "removing dev alloc info"); + return ret; +} + /* Bucket IO clocks: */ -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) +static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - u64 now; - int ret = 0; - a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); - ret = PTR_ERR_OR_ZERO(a); + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); + int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + u64 now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; @@ -2066,6 +2401,15 @@ out: return ret; } +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); +} + /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -2130,6 +2474,7 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); + c->reserved = reserved_sectors; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; @@ -2168,13 +2513,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); + c->rw_devs_change_count++; + /* * Capacity is calculated based off of devices in allocation groups: */ @@ -2203,16 +2550,29 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* device goes rw: */ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); + + c->rw_devs_change_count++; +} + +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) +{ + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h 
b/fs/bcachefs/alloc_background.h index e7f7e842ee1b..de25ba4ee94b 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,21 +8,16 @@ #include "debug.h" #include "super.h" -enum bkey_invalid_flags; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, pos.inode)) - return false; - - ca = bch_dev_bkey_exists(c, pos.inode); - return pos.offset >= ca->mi.first_bucket && - pos.offset < ca->mi.nbuckets; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); + bool ret = ca && bucket_valid(ca, pos.offset); + rcu_read_unlock(); + return ret; } static inline u64 bucket_to_u64(struct bpos bucket) @@ -40,58 +35,122 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) return a.gen - a.oldest_gen; } -static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, - u32 cached_sectors, - u32 stripe, - struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { - if (stripe) - return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; - if (dirty_sectors) - return data_type; - if (cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; } -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) +{ + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; +} + +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { - return __alloc_data_type(a.dirty_sectors, a.cached_sectors, - a.stripe, a, data_type); + struct bch_alloc_v4 ret = {}; + __bucket_m_to_alloc(&ret, b); + return ret; } static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) { - return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; + switch (data_type) { + case BCH_DATA_cached: + case BCH_DATA_stripe: + return BCH_DATA_user; + default: + return data_type; + } +} + +static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, + enum bch_data_type ptr) +{ + return !data_type_is_empty(bucket) && + bucket_data_type(bucket) != bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +/* + * It is my general preference to use unsigned types for unsigned quantities - + * however, these helpers are used in disk accounting calculations run by + * triggers where the output will be negated and added to an s64. unsigned is + * right out even though all these quantities will fit in 32 bits, since it + * won't be sign extended correctly; u64 will negate "correctly", but s64 is the + * simpler option here. 
+ */ +static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { - return a.dirty_sectors + a.cached_sectors; + return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; } -static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) { - return a.dirty_sectors; + return a.stripe_sectors + a.dirty_sectors; } -static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, +static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached + ? a.cached_sectors + : bch2_bucket_sectors_dirty(a); +} + +static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, struct bch_alloc_v4 a) { - int d = bch2_bucket_sectors_dirty(a); + int d = bch2_bucket_sectors(a); + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + +static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) +{ + int d = a.stripe_sectors + a.dirty_sectors; return d ? max(0, ca->mi.bucket_size - d) : 0; } +static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; +} + +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (a.stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (bch2_bucket_sectors_dirty(a)) + return data_type; + if (a.cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) +{ + a->data_type = alloc_data_type(*a, data_type); +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { - return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached + ? 
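The comment above rules out plain unsigned return types for these helpers: when a trigger negates the value and adds it to a signed 64-bit counter, a 32-bit unsigned result is zero-extended rather than sign-extended and the accounting ends up off by 2^32. A minimal standalone example of that failure mode, using toy helpers rather than the real ones and assuming the usual ABI where unsigned int is 32 bits:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t sectors_u32(void) { return 128; }   /* what the comment warns against */
static int64_t  sectors_s64(void) { return 128; }   /* what the helpers above return */

int main(void)
{
    int64_t acct = 0;

    /*
     * -sectors_u32() is evaluated in 32-bit unsigned arithmetic, so it
     * becomes 4294967168 before being widened to 64 bits.
     */
    acct += -sectors_u32();
    printf("unsigned helper: %" PRId64 "\n", acct);     /* 4294967168 */

    acct = 0;
    acct += -sectors_s64();
    printf("s64 helper:      %" PRId64 "\n", acct);     /* -128 */
    return 0;
}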
a.io_time[READ] & LRU_TIME_MAX + : 0; } #define DATA_TYPES_MOVABLE \ @@ -107,11 +166,20 @@ static inline bool data_type_movable(enum bch_data_type type) static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { + if (a.data_type >= BCH_DATA_NR) + return 0; + if (!data_type_movable(a.data_type) || !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } @@ -126,13 +194,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ return pos; } -static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) { - unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: BCH_ALLOC_V4_U64s_V0) + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * (sizeof(struct bch_backpointer) / sizeof(u64)); +} +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = alloc_v4_u64s_noerror(a); BUG_ON(ret > U8_MAX - BKEY_U64s); return ret; } @@ -143,7 +215,10 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); +bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, + enum btree_iter_update_trigger_flags); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -168,52 +243,52 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v1_invalid, \ + .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v2_invalid, \ + .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v3_invalid, \ + .key_validate = bch2_alloc_v3_validate, \ .val_to_text = 
bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v4_invalid, \ + .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_invalid = bch2_bucket_gens_invalid, \ + .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) @@ -228,10 +303,17 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, unsigned); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); + +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -246,6 +328,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -262,6 +345,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); +int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); @@ -269,6 +353,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index b4ec20be93b8..740238369a5a 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) struct bch_alloc_v4 { struct bch_val v; - __u64 journal_seq; + __u64 journal_seq_nonempty; __u32 flags; __u8 gen; __u8 oldest_gen; @@ -69,7 +69,10 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; - __u64 fragmentation_lru; + /* end of fields in original version of alloc_v4 */ + __u64 journal_seq_empty; + __u32 stripe_sectors; + __u32 pad; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 diff --git a/fs/bcachefs/alloc_foreground.c 
b/fs/bcachefs/alloc_foreground.c index 633d3223b353..5a781fb4c794 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c) { rcu_read_lock(); for_each_member_device_rcu(c, ca, NULL) - ca->alloc_cursor = 0; + memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); rcu_read_unlock(); } @@ -100,21 +100,17 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (ob->ec) { ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); return; } - percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - ob->valid = false; ob->data_type = 0; - spin_unlock(&ob->lock); - percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_open_bucket_hash_remove(c, ob); @@ -156,12 +152,24 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; + rcu_read_unlock(); + ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = ob - c->open_buckets; @@ -171,25 +179,13 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { - case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; @@ -200,33 +196,44 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } } -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 bucket, - enum bch_watermark watermark, - const struct bch_alloc_v4 *a, - struct bucket_alloc_state *s, - struct closure *cl) +static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) { - struct open_bucket *ob; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; - return NULL; - } - - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { s->skipped_open++; - return NULL; + return false; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + u64 journal_seq_ready = + bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) + 
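open_buckets_reserved() above gives each watermark a different share of the fixed open-bucket table, so reclaim and interior btree updates can still get open buckets after ordinary writes have already been told to wait. A self-contained sketch of that ladder; the enum ordering, the OPEN_BUCKETS_COUNT value and the helper names are illustrative stand-ins, not the bcachefs definitions:

#include <stdbool.h>
#include <stdio.h>

#define OPEN_BUCKETS_COUNT 1024     /* made-up table size */

enum watermark {                    /* lowest priority first */
    WM_normal,
    WM_copygc,
    WM_btree,
    WM_btree_copygc,
    WM_reclaim,
    WM_interior_updates,
};

/* How many free open buckets must be left alone for higher-priority callers. */
static unsigned open_buckets_reserved(enum watermark wm)
{
    switch (wm) {
    case WM_interior_updates:
        return 0;
    case WM_reclaim:
        return OPEN_BUCKETS_COUNT / 6;
    case WM_btree:
    case WM_btree_copygc:
        return OPEN_BUCKETS_COUNT / 4;
    default:
        return OPEN_BUCKETS_COUNT / 2;
    }
}

static bool may_take_open_bucket(unsigned nr_free, enum watermark wm)
{
    return nr_free > open_buckets_reserved(wm);
}

int main(void)
{
    unsigned nr_free = 200;         /* table is running low */

    printf("normal:  %d\n", may_take_open_bucket(nr_free, WM_normal));  /* 0 */
    printf("btree:   %d\n", may_take_open_bucket(nr_free, WM_btree));   /* 0 */
    printf("reclaim: %d\n", may_take_open_bucket(nr_free, WM_reclaim)); /* 1 */
    return 0;
}

The higher the watermark, the deeper into the shared table a caller may dig, presumably so that interior btree updates can always make forward progress even when everything else is waiting.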
s->need_journal_commit++; s->skipped_need_journal_commit++; - return NULL; + return false; } - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { s->skipped_nocow++; + return false; + } + + return true; +} + +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, u8 gen, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; return NULL; } @@ -236,8 +243,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -249,136 +255,50 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - ob = bch2_open_bucket_alloc(c); + struct open_bucket *ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; - ob->gen = a->gen; + ob->gen = gen; ob->bucket = bucket; spin_unlock(&ob->lock); ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, u64 free_entry, + enum bch_watermark watermark, struct bucket_alloc_state *s, - struct bkey_s_c freespace_k, + struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_s_c k; - struct open_bucket *ob; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 b = free_entry & ~(~0ULL << 56); - unsigned genbits = free_entry >> 56; - struct printbuf buf = PRINTBUF; - int ret; - - if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { - prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" - " freespace key ", - ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_val_to_text(&buf, c, freespace_k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_CACHED); - ret = bkey_err(k); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } - - a = bch2_alloc_to_v4(k, &a_convert); - - if (a->data_type != BCH_DATA_free) { - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { - ob = NULL; - goto err; - } - - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); - bch2_bkey_val_to_text(&buf, c, freespace_k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (genbits != (alloc_freespace_genbits(*a) 
>> 56) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_val_to_text(&buf, c, freespace_k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; + u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_pos, &bp, - BTREE_ITER_NOPRESERVE); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } + if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + return NULL; - if (!bkey_eq(bp_pos, POS_MAX)) { - /* - * Bucket may have data in it - we don't call - * bc2h_trans_inconnsistent() because fsck hasn't - * finished yet - */ - ob = NULL; - goto err; - } - } + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); + if (ret < 0) + return ERR_PTR(ret); + if (ret) + return NULL; - ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); - if (!ob) - set_btree_iter_dontneed(&iter); -err: - if (iter.path) - set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ob; + return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -387,11 +307,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 first_bucket = ca->mi.first_bucket; + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -406,23 +328,35 @@ bch2_bucket_alloc_early(struct btree_trans *trans, */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; + BTREE_ITER_slots, k, ret) { + u64 bucket = k.k->p.offset; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + break; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + s->buckets_seen++; + s->skipped_mi_btree_bitmap++; continue; + } - a = bch2_alloc_to_v4(k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); if (a->data_type != 
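The btree-bitmap skip in bch2_bucket_alloc_early() above advances the cursor a whole granule at a time: when a bucket's marking doesn't match what the allocation wants, the cursor jumps to the first bucket starting at the next 1 << btree_bitmap_shift sector boundary instead of walking bucket by bucket. A runnable sketch of that cursor arithmetic, with a made-up bucket size and shift:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BUCKET_SECTORS      128ULL  /* hypothetical bucket size */
#define BTREE_BITMAP_SHIFT  10      /* one granule = 1024 sectors */

static uint64_t round_up_u64(uint64_t v, uint64_t align)
{
    return (v + align - 1) / align * align;
}

static uint64_t bucket_to_sector(uint64_t bucket) { return bucket * BUCKET_SECTORS; }
static uint64_t sector_to_bucket(uint64_t sector) { return sector / BUCKET_SECTORS; }

/* First bucket whose sectors begin in the next bitmap granule. */
static uint64_t skip_to_next_granule(uint64_t bucket)
{
    return sector_to_bucket(round_up_u64(bucket_to_sector(bucket) + 1,
                                         1ULL << BTREE_BITMAP_SHIFT));
}

int main(void)
{
    /* Bucket 3 starts at sector 384; its granule ends at sector 1024. */
    printf("%" PRIu64 "\n", skip_to_next_granule(3));   /* 8  */
    printf("%" PRIu64 "\n", skip_to_next_granule(8));   /* 16 */
    return 0;
}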
BCH_DATA_free) continue; /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); ret = bkey_err(ck); if (ret) break; @@ -433,9 +367,12 @@ again: s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); + ob = may_alloc_bucket(c, k.k->p, s) + ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, + watermark, s, cl) + : NULL; next: - set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -443,7 +380,6 @@ next: bch2_trans_iter_exit(trans, &iter); alloc_cursor = iter.pos.offset; - ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); @@ -453,6 +389,8 @@ next: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } @@ -465,44 +403,63 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), 0, k, ret) { - if (k.k->p.inode != ca->dev_idx) - break; - - for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); - alloc_cursor < k.k->p.offset; - alloc_cursor++) { - ret = btree_trans_too_many_iters(trans); - if (ret) { - ob = ERR_PTR(ret); - break; - } + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), + POS(ca->dev_idx, U64_MAX), + 0, k, ret) { + /* + * peek normally dosen't trim extents - they can span iter.pos, + * which is not what we want here: + */ + iter.k.size = iter.k.p.offset - iter.pos.offset; + while (iter.k.size) { s->buckets_seen++; - ob = try_alloc_bucket(trans, ca, watermark, - alloc_cursor, s, k, cl); + u64 bucket = iter.pos.offset & ~(~0ULL << 56); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + goto fail; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket + 1), + 1ULL << ca->mi.btree_bitmap_shift)); + alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); + + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + s->skipped_mi_btree_bitmap++; + goto next; + } + + ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); if (ob) { - set_btree_iter_dontneed(&iter); + if (!IS_ERR(ob)) + *dev_alloc_cursor = iter.pos.offset; + bch2_set_btree_iter_dontneed(&iter); break; } - } + iter.k.size--; + iter.pos.offset++; + } +next: if (ob || ret) break; } +fail: bch2_trans_iter_exit(trans, &iter); - ca->alloc_cursor = alloc_cursor; + BUG_ON(ob && ret); - if (!ob && ret) + if (ret) ob = ERR_PTR(ret); if (!ob && alloc_start > ca->mi.first_bucket) { @@ -513,12 +470,53 @@ again: return ob; } +static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + enum bch_data_type data_type, + struct closure *cl, + struct 
bch_dev_usage *usage, + struct bucket_alloc_state *s, + struct open_bucket *ob) +{ + struct printbuf buf = PRINTBUF; + + printbuf_tabstop_push(&buf, 24); + + prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "blocking\t%u\n", cl != NULL); + prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); + prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); + prt_printf(&buf, "open\t%llu\n", s->skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + + if (!IS_ERR(ob)) { + prt_printf(&buf, "allocated\t%llu\n", ob->bucket); + trace_bucket_alloc(c, buf.buf); + } else { + prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); + trace_bucket_alloc_fail(c, buf.buf); + } + + printbuf_exit(&buf); +} + /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object * @ca: device to allocate from * @watermark: how important is this allocation? + * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available + * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. @@ -526,37 +524,44 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl, + bool nowait, struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { 0 }; - bool waiting = false; + struct bucket_alloc_state s = { + .btree_bitmap = data_type == BCH_DATA_btree, + }; + bool waiting = nowait; again: bch2_dev_usage_read_fast(ca, usage); avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; @@ -569,9 +574,14 @@ alloc: ? 
bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - if (s.skipped_need_journal_commit * 2 > avail) + if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); + if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { + s.btree_bitmap = BTREE_BITMAP_ANY; + goto alloc; + } + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; @@ -581,41 +591,32 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, - bch2_watermarks[watermark], - ob->bucket, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - ""); + ob->data_type = data_type; + + if (!IS_ERR(ob)) + count_event(c, bucket_alloc); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - trace_and_count(c, bucket_alloc_fail, ca, - bch2_watermarks[watermark], - 0, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); + count_event(c, bucket_alloc_fail); + + if (!IS_ERR(ob) + ? trace_bucket_alloc_enabled() + : trace_bucket_alloc_fail_enabled()) + trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); return ob; } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl) { struct bch_dev_usage usage; struct open_bucket *ob; - bch2_trans_do(c, NULL, NULL, 0, + bch2_trans_do(c, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - cl, &usage))); + data_type, cl, false, &usage))); return ob; } @@ -636,9 +637,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -678,11 +679,9 @@ static int add_new_bucket(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, struct open_bucket *ob) { - unsigned durability = - bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned durability = ob_dev(c, ob)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); @@ -706,45 +705,33 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, + enum bch_write_flags flags, enum bch_data_type data_type, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); - unsigned dev; - struct bch_dev *ca; int ret = -BCH_ERR_insufficient_devices; - unsigned i; BUG_ON(*nr_effective >= nr_replicas); - for (i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - dev = devs_sorted.devs[i]; - - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; if (!ca->mi.durability && *have_cache) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); continue; } - ob = 
bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -753,11 +740,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob->data_type = data_type; - if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob)) { + have_cache, ob)) { ret = 0; break; } @@ -783,14 +768,10 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -799,34 +780,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, flags, ob); -out_put_head: +out: bch2_ec_stripe_head_put(c, h); return ret; } @@ -839,7 +818,7 @@ static bool want_bucket(struct bch_fs *c, bool *have_cache, bool ec, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (!test_bit(ob->dev, devs_may_alloc->d)) return false; @@ -864,7 +843,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - bool ec, unsigned flags) + bool ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -876,7 +855,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, have_cache, ec, ob)) ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); else ob_push(c, &ptrs_skip, ob); } @@ -892,8 +871,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, bool ec, - enum bch_watermark watermark, - unsigned flags) + enum 
bch_watermark watermark) { int i, ret = 0; @@ -909,12 +887,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c, struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark); + avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -923,9 +901,13 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); if (ret) break; } @@ -945,7 +927,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *_cl) { struct bch_fs *c = trans->c; @@ -964,18 +946,15 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); - if (erasure_code && ec_open_bucket(c, ptrs)) - return 0; - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, flags); + have_cache, erasure_code); if (ret) return ret; ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, watermark, flags); + have_cache, erasure_code, watermark); if (ret) return ret; @@ -1016,12 +995,12 @@ static int open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { int ret; - if (erasure_code) { + if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, @@ -1136,7 +1115,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, --c->open_buckets_partial_nr; swap(c->open_buckets_partial[i], c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); spin_lock(&c->freelist_lock); @@ -1294,7 +1279,7 @@ deallocate_extra_replicas(struct bch_fs *c, unsigned i; open_bucket_for_each(c, ptrs, ob, i) { - unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; @@ -1318,7 +1303,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned nr_replicas_required, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl, struct write_point **wp_ret) { @@ -1334,8 +1319,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); - BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -1345,6 +1328,10 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); + ret = bch2_trans_relock(trans); + if (ret) + goto err; + /* 
metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; @@ -1361,15 +1348,17 @@ retry: /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, watermark, flags, cl); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + if (!ret2 || + bch2_err_matches(ret2, BCH_ERR_transaction_restart) || + bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { + ret = ret2; goto alloc_done; + } } /* @@ -1434,18 +1423,19 @@ err: try_decrease_writepoints(trans, write_points_nr)) goto retry; - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + + if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - return cl - ? -BCH_ERR_bucket_alloc_blocked - : -BCH_ERR_ENOSPC_bucket_alloc; + ret = -BCH_ERR_bucket_alloc_blocked; return ret; } struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); return (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, @@ -1519,9 +1509,9 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) } } -static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ @@ -1539,7 +1529,8 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str prt_newline(out); } -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) { struct open_bucket *ob; @@ -1549,7 +1540,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) + if (ob->valid && (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } @@ -1623,3 +1614,121 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) prt_str(out, "Btree write point\n"); bch2_write_point_to_text(out, c, &c->btree_write_point); } + +void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + prt_printf(out, "capacity\t%llu\n", c->capacity); + prt_printf(out, "reserved\t%llu\n", c->reserved); + prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); + prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); + prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); + prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); + prt_printf(out, "reserved\t%llu\n", 
percpu_u64_get(&c->usage->reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); + + prt_newline(out); + prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); + prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); + prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); + prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); +} + +void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + bch2_dev_usage_to_text(out, ca, &stats); + + prt_newline(out); + + prt_printf(out, "reserves:\n"); + for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) + prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + + prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); + prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); +} + +static noinline void bch2_print_allocator_stuck(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "Allocator stuck? 
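Aside (sketch, not from the diff): the new bch2_fs_alloc_debug_to_text()/bch2_dev_alloc_debug_to_text() helpers above rely on printbuf tabstops to line up a label column and a value column. A minimal illustration of that formatting convention, with a hypothetical output function:

static void example_debug_to_text(struct printbuf *out, u64 capacity, u64 dirty)
{
	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 24);		/* values start at column 24 */

	prt_printf(out, "capacity\t%llu\n", capacity);
	prt_printf(out, "dirty\t%llu\n", dirty);
}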
Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); + + prt_printf(&buf, "Allocator debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_fs_alloc_debug_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + for_each_online_member(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + prt_printf(&buf, "Copygc debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_copygc_wait_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + prt_printf(&buf, "Journal debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_journal_debug_to_text(&buf, &c->journal); + printbuf_indent_sub(&buf, 2); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +static inline unsigned allocator_wait_timeout(struct bch_fs *c) +{ + if (c->allocator_last_stuck && + time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) + return 0; + + return c->opts.allocator_stuck_timeout * HZ; +} + +void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + unsigned t = allocator_wait_timeout(c); + + if (t && closure_sync_timeout(cl, t)) { + c->allocator_last_stuck = jiffies; + bch2_print_allocator_stuck(c); + } + + closure_sync(cl); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 7aaeec44c746..f25481a0d1a0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, @@ -28,10 +28,14 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); +static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) +{ + return bch2_dev_have_ref(c, ob->dev); +} struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, struct closure *); + enum bch_watermark, enum bch_data_type, + struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -149,9 +153,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 return ret; } +enum bch_write_flags; int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, unsigned, + unsigned, unsigned *, bool *, enum bch_write_flags, enum bch_data_type, enum bch_watermark, struct closure *); @@ -161,7 +166,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct bch_devs_list *, unsigned, unsigned, enum bch_watermark, - unsigned, + enum bch_write_flags, struct closure *, struct write_point **); @@ -184,7 +189,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ptr.cached = cached || @@ -216,9 +221,20 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); -void 
bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); +void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); + +void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); +static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + if (cl->closure_get_happened) + __bch2_wait_on_allocator(c, cl); +} + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b91b7a461056..4aa8ee026cb8 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -9,11 +9,19 @@ #include "fifo.h" struct bucket_alloc_state { + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; }; #define BCH_WATERMARKS() \ @@ -22,7 +30,8 @@ struct bucket_alloc_state { x(copygc) \ x(btree) \ x(btree_copygc) \ - x(reclaim) + x(reclaim) \ + x(interior_updates) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 569b97904da4..ebeb6a5ff9d2 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -8,88 +8,87 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "checksum.h" +#include "disk_accounting.h" #include "error.h" #include <linux/mm.h> -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bpos bucket, - struct bch_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; - struct bch_backpointer bp2; - - if (p.ptr.cached) - continue; - - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, - &bucket2, &bp2); - if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) - return true; - } - - return false; -} - -int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; - bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), - c, err, - backpointer_pos_wrong, - "backpointer at wrong pos"); + bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, + c, backpointer_level_bad, + "backpointer level bad: %u >= %u", + bp.v->level, BTREE_MAX_DEPTH); + + bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, + c, backpointer_dev_bad, + "backpointer for BCH_SB_MEMBER_INVALID"); fsck_err: return ret; } -void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - 
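Aside (hypothetical sketch, not from the diff): alloc_foreground.h now exports __bch2_wait_on_allocator() plus an inline bch2_wait_on_allocator() that only sleeps if the closure actually took a reference. A caller that sees -BCH_ERR_bucket_alloc_blocked would park on its stack closure roughly as follows; the allocation call itself is elided and stands in as a placeholder.

static int example_write_path(struct bch_fs *c)
{
	struct closure cl;
	int ret;

	closure_init_stack(&cl);
retry:
	/* placeholder: a real caller would invoke e.g. bch2_alloc_sectors_start_trans() here */
	ret = 0;

	if (bch2_err_matches(ret, BCH_ERR_bucket_alloc_blocked)) {
		bch2_wait_on_allocator(c, &cl);	/* waits, warns if the allocator looks stuck */
		goto retry;
	}

	return ret;
}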
bch2_btree_id_str(bp->btree_id), - bp->level, - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); - bch2_bpos_to_text(out, bp->pos); -} + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - if (bch2_dev_exists2(c, k.k->p.inode)) { - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); - prt_str(out, " "); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) { + u32 bucket_offset; + struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + rcu_read_unlock(); + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); + } else { + rcu_read_unlock(); + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } - bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); + bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_printf(out, " suboffset=%u len=%u gen=%u pos=", + (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp.v->bucket_len, + bp.v->bucket_gen); + bch2_bpos_to_text(out, bp.v->pos); } void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bkey_s_c_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); + + if (bpos_eq(bp.k->p, bp2.k.p) && + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) + return true; + } + + return false; +} + static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bch_backpointer bp, - struct bkey_s_c bp_k, struct bkey_s_c orig_k, + struct bkey_i_backpointer *new_bp, + struct bkey_s_c found_bp, bool insert) { struct bch_fs *c = trans->c; @@ -97,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); printbuf_indent_add(&buf, 2); prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -110,16 +109,15 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - prt_printf(&buf, "backpointer not found when deleting"); - prt_newline(&buf); + prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -131,257 +129,225 @@ static noinline int 
backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { struct btree_iter bp_iter; - struct bkey_s_c k; - struct bkey_i_backpointer *bp_k; - int ret; - - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp->k.p, + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); + int ret = bkey_err(k); if (ret) return ret; - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); - bp_k->v = bp; - - if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); - } - - k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - ret = bkey_err(k); - if (ret) - goto err; - if (insert ? k.k->type : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { - ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { + ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) goto err; } - ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); + if (!insert) { + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); + } + + ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); err: bch2_trans_iter_exit(trans, &bp_iter); return ret; } -/* - * Find the next backpointer >= *bp_offset: - */ -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - struct bpos *bp_pos, - struct bch_backpointer *bp, - unsigned iter_flags) +static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - struct bch_fs *c = trans->c; - struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); - struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; - struct bkey_s_c k; - int ret = 0; - - if (bpos_ge(*bp_pos, bp_end_pos)) - goto done; - - if (gen >= 0) { - k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4 || - bkey_s_c_to_alloc_v4(k).v->gen != gen) - goto done; - } - - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); - - for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - *bp_pos, iter_flags, k, ret) { - if (bpos_ge(k.k->p, bp_end_pos)) - break; + return (likely(!bch2_backpointers_no_use_write_buffer) + ? 
bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +} - *bp_pos = k.k->p; - *bp = *bkey_s_c_to_backpointer(k).v; - goto out; - } -done: - *bp_pos = SPOS_MAX; -out: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; +static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; } -static void backpointer_not_found(struct btree_trans *trans, - struct bpos bp_pos, - struct bch_backpointer bp, - struct bkey_s_c k) +static int backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + int ret = 0; /* * If we're using the btree write buffer, the backpointer we were * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ - if (likely(!bch2_backpointers_no_use_write_buffer)) - return; + ret = last_flushed + ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) + : 0; + if (ret) + return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - bp.level ? "btree node" : "extent"); - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, "\n "); + bp.v->level ? "btree node" : "extent"); + bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, target_k); - bch2_backpointer_to_text(&buf, &bp); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) - bch_err_ratelimited(c, "%s", buf.buf); - else - bch2_trans_inconsistent(trans, "%s", buf.buf); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) + if (p.ptr.dev == bp.k->p.inode) { + prt_printf(&buf, "\n "); + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); + } + if (fsck_err(trans, backpointer_to_missing_ptr, + "%s", buf.buf)) + ret = bch2_backpointer_del(trans, bp.k->p); +fsck_err: printbuf_exit(&buf); + return ret; } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp, - unsigned iter_flags) + unsigned iter_flags, + struct bkey_buf *last_flushed) { - if (likely(!bp.level)) { - struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; + struct bch_fs *c = trans->c; + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + + if (likely(!bp.v->level)) { bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, 0, iter_flags); - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); 
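Aside (hypothetical sketch, not from the diff): the rewritten checks above thread a struct bkey_buf last_flushed through so the btree write buffer is flushed at most once per suspect key before anything is deleted or reported. The setup and teardown a caller needs:

static int example_check_loop(struct btree_trans *trans)
{
	struct bkey_buf last_flushed;

	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	/*
	 * Walk keys here; when a backpointer looks stale, call
	 * bch2_btree_write_buffer_maybe_flush(trans, k, &last_flushed)
	 * and re-check before treating it as an inconsistency.
	 */

	bch2_bkey_buf_exit(&last_flushed, trans->c);
	return 0;
}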
return k; } - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; bch2_trans_iter_exit(trans, iter); - backpointer_not_found(trans, bp_pos, bp, k); - return bkey_s_c_null; + int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - if (IS_ERR_OR_NULL(b)) { - bch2_trans_iter_exit(trans, iter); - return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; - } return bkey_i_to_s_c(&b->key); } } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct btree *b; - BUG_ON(!bp.level); + BUG_ON(!bp.v->level); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, - bp.level - 1, + bp.v->level - 1, 0); - b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; - BUG_ON(b->c.level != bp.level - 1); + BUG_ON(b->c.level != bp.v->level - 1); - if (extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, + bkey_i_to_s_c(&b->key), bp)) return b; if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); - b = NULL; + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); + b = ret ? 
ERR_PTR(ret) : NULL; } err: bch2_trans_iter_exit(trans, iter); return b; } -static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, - struct bkey_s_c k) +static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, + struct bkey_buf *last_flushed) { + if (k.k->type != KEY_TYPE_backpointer) + return 0; + struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); goto out; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, - backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); - goto out; + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: @@ -393,94 +359,222 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_btree_backpointer(trans, &iter, k))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - struct extents_to_bp_state { - struct bpos bucket_start; - struct bpos bucket_end; + struct bpos bp_start; + struct bpos bp_end; struct bkey_buf last_flushed; }; +static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, + struct bkey_s_c extent, unsigned dev) +{ + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bch2_bkey_drop_device(bkey_i_to_s(n), dev); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +static int check_extent_checksum(struct btree_trans *trans, + enum btree_id btree, struct bkey_s_c extent, + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) +{ 
+ struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; + int ret = 0; + + if (bkey_is_btree_ptr(extent.k)) + return false; + + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) + if (p.ptr.dev == dev) + goto found; + BUG(); +found: + if (!p.crc.csum_type) + return false; + + bytes = p.crc.compressed_size << 9; + + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + if (!ca) + return false; + + data_buf = kvmalloc(bytes, GFP_KERNEL); + if (!data_buf) { + ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = p.ptr.offset; + bch2_bio_map(bio, data_buf, bytes); + ret = submit_bio_wait(bio); + if (ret) + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, extent); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, o_btree); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), + trans, dup_backpointer_to_bad_csum_extent, + "%s", buf.buf)) + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; +fsck_err: +err: + if (bio) + bio_put(bio); + kvfree(data_buf); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, - struct bpos bucket, - struct bch_backpointer bp, + struct bkey_i_backpointer *bp, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; + struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; - struct bkey_s_c bp_k; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - if (bpos_lt(bucket, s->bucket_start) || - bpos_gt(bucket, s->bucket_end)) + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) return 0; - if (!bch2_dev_bucket_exists(c, bucket)) + struct btree_iter bp_iter; + struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + int ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; + + goto check_existing_bp; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + return ret; +check_existing_bp: + /* Do we have a backpointer for a different extent? 
*/ + if (bp_k.k->type != KEY_TYPE_backpointer) goto missing; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), - 0); - ret = bkey_err(bp_k); + struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); + + struct bkey_s_c other_extent = + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; if (ret) goto err; - if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); + if (!other_extent.k) + goto missing; - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { - if (bp.level) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } + if (bch2_extents_match(orig_k, other_extent)) { + printbuf_reset(&buf); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); - ret = bch2_btree_write_buffer_flush_sync(trans); + if (other_extent.k->size <= orig_k.k->size) { + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); if (ret) goto err; - - bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; + } else { + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); + if (ret) + goto err; + goto missing; } + } + + ret = check_extent_checksum(trans, + other_bp.v->btree_id, other_extent, + bp->v.btree_id, orig_k, + bp->k.p.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; goto missing; } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&tmp, c); - printbuf_exit(&buf); - return ret; + + ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, + other_bp.v->btree_id, other_extent, bp->k.p.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto out; + } + + printbuf_reset(&buf); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; missing: - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_id_str(bp.btree_id), bp.level); + printbuf_reset(&buf); + prt_str(&buf, "missing backpointer\n for: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nbp pos "); - bch2_bpos_to_text(&buf, bp_iter.pos); + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, bp_k); - if (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); goto out; } @@ -491,25 +585,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - int ret; - ptrs = bch2_bkey_ptrs_c(k); 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; - struct bch_backpointer bp; - if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, - k, p, &bucket_pos, &bp); + if (p.ptr.dev == BCH_SB_MEMBER_INVALID) + continue; - ret = check_bp_exists(trans, s, bucket_pos, bp, k); - if (ret) - return ret; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); + bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + rcu_read_unlock(); + + if (check || empty) { + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = check + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; + } } return 0; @@ -555,69 +657,138 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + bch2_btree_cache_unpin(c); + + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; + c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; - if (!((1U << btree) & btree_leaf_mask) && - !((1U << btree) & btree_interior_mask)) + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; + + if (!(BIT_ULL(btree) & btree_leaf_mask) && + !(BIT_ULL(btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) + ret = __for_each_btree_node(trans, iter, btree, + btree == start.btree ? 
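Aside (standalone illustration, not from the diff): mem_may_pin_bytes() above sizes the per-pass pinning budget from total RAM and the fsck_memory_usage_percent option, and btree_nodes_fit_in_ram() divides that by the btree node size. With made-up numbers (8 GiB RAM, a 50% budget, 256 KiB nodes):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t totalram_bytes = 8ULL << 30;		/* 8 GiB */
	unsigned fsck_memory_usage_percent = 50;	/* example value */
	unsigned btree_node_size = 256 << 10;		/* 256 KiB */

	uint64_t may_pin = totalram_bytes * fsck_memory_usage_percent / 100;

	printf("budget %llu bytes -> %llu btree nodes per pass\n",
	       (unsigned long long) may_pin,
	       (unsigned long long) (may_pin / btree_node_size));
	return 0;
}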
start.pos : POS_MIN, + 0, depth, BTREE_ITER_prefetch, b, ({ + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); - bch2_trans_iter_exit(trans, &iter); - return 0; } - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(trans, &iter); + bch2_node_pin(c, b); + 0; + })); + } + + return ret; +} + +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +static inline void progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = i, + }; + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); - *end = BBPOS_MAX; + if (ret) + s->next_print = jiffies + HZ * 10; return ret; } +static void progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; + struct progress_indicator_state progress; int ret = 0; + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { @@ -631,31 +802,14 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, while (level >= depth) { struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - level, - BTREE_ITER_PREFETCH); - while (1) { - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) - break; - if (bpos_eq(iter.pos, SPOS_MAX)) - break; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); - + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, + BTREE_ITER_prefetch); + + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); if (ret) return ret; @@ -666,102 +820,330 
@@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) +enum alloc_sector_counter { + ALLOC_dirty, + ALLOC_cached, + ALLOC_stripe, + ALLOC_SECTORS_NR +}; + +static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) { - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; + switch (t) { + case BCH_DATA_btree: + case BCH_DATA_user: + return ALLOC_dirty; + case BCH_DATA_cached: + return ALLOC_cached; + case BCH_DATA_stripe: + return ALLOC_stripe; + default: + BUG(); + } } -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + +static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + struct bkey_buf *last_flushed) { - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; + + if (a->data_type == BCH_DATA_sb || + a->data_type == BCH_DATA_journal || + a->data_type == BCH_DATA_parity) + return 0; + + u32 sectors[ALLOC_SECTORS_NR]; + memset(sectors, 0, sizeof(sectors)); + + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); + if (!ca) + return 0; + + struct btree_iter iter; + struct bkey_s_c bp_k; int ret = 0; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, alloc_k.k->p), + bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + (bp.v->bucket_gen != a->gen || + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) + break; + + need_commit = true; + continue; } - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? 
alloc_k.k->p : SPOS_MAX; - break; + if (bp.v->bucket_gen != a->gen) + continue; + + sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + }; + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + } + + /* Cached pointers don't have backpointers: */ + + if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_stripe] != a->stripe_sectors) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) + goto err; } - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + -BCH_ERR_transaction_restart_nested; + goto err; } + + if (!sectors[ALLOC_dirty] && + !sectors[ALLOC_stripe]) + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); + else + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); +err: + bch2_dev_put(ca); + return ret; +} + +static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr_v2: { + bool ret = false; + + rcu_read_lock(); + struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; + while (pos.inode <= k.k->p.inode) { + if (pos.inode >= c->sb.nr_devices) + break; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); + if (!ca) + goto next; + + struct bpos bucket = bp_pos_to_bucket(ca, pos); + bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, + ca->mi.nbuckets, bucket.offset); + if (bucket.offset == ca->mi.nbuckets) + goto next; + + ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); + if (ret) + break; +next: + pos = SPOS(pos.inode + 1, 0, 0); + } + rcu_read_unlock(); + + return ret; + } + case KEY_TYPE_btree_ptr: + return true; + default: + return false; + } +} + +static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + enum btree_id btree, unsigned level) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + if (b) + bch2_node_pin(trans->c, b); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bkey_buf tmp; + bch2_bkey_buf_init(&tmp); + + bch2_btree_cache_unpin(c); + + *end = SPOS_MAX; + + s64 mem_may_pin = mem_may_pin_bytes(c); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + 
bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); + })); + if (ret) + return ret; + + struct bpos pinned = SPOS_MAX; + mem_may_pin = mem_may_pin_bytes(c); + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) { + *end = pinned; + break; + } + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); + + if (!ret2) + pinned = tmp.k->k.p; + + ret; + })); + if (ret) + return ret; + return ret; } int bch2_check_extents_to_backpointers(struct bch_fs *c) { + int ret = 0; + + /* + * Can't allow devices to come/go/resize while we have bucket bitmaps + * allocated + */ + lockdep_assert_held(&c->state_lock); + + for_each_member_device(c, ca) { + BUG_ON(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + if (!ca->bucket_backpointer_mismatches || + !ca->bucket_backpointer_empty) { + bch2_dev_put(ca); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err_free_bitmaps; + } + } + struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bucket_start = POS_MIN }; - int ret; + struct extents_to_bp_state s = { .bp_start = POS_MIN }; bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ + check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + })); + if (ret) + goto err; + + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + for_each_member_device(c, ca) { + nr_buckets += ca->mi.nbuckets; + nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + } + + if (!nr_mismatches && !nr_empty) + goto err; + + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", + nr_mismatches + nr_empty, nr_buckets); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); if (ret) break; - if ( bpos_eq(s.bucket_start, POS_MIN) && - !bpos_eq(s.bucket_end, SPOS_MAX)) + if ( bpos_eq(s.bp_start, POS_MIN) && + !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(s.bucket_start, POS_MIN) || - !bpos_eq(s.bucket_end, SPOS_MAX)) { + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bucket_start); + bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bucket_end); + bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) + if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; - s.bucket_start = 
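Aside (sketch, not from the diff): bch2_check_extents_to_backpointers() above allocates two per-device bitmaps, bucket_backpointer_mismatches and bucket_backpointer_empty, each BITS_TO_LONGS(nbuckets) longs; check_bucket_backpointer_mismatch() sets bits in them and the extent pass only re-creates backpointers for flagged buckets. The bitmap handling in isolation (hypothetical helper, kernel context assumed):

static void example_bucket_bitmap(u64 nbuckets)
{
	unsigned long *bitmap = kvcalloc(BITS_TO_LONGS(nbuckets),
					 sizeof(unsigned long), GFP_KERNEL);
	if (!bitmap)
		return;

	__set_bit(3, bitmap);			/* flag bucket 3 as mismatched */

	/* same tally as the nr_mismatches count above */
	pr_info("%u buckets flagged\n", bitmap_weight(bitmap, nbuckets));

	kvfree(bitmap);
}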
bpos_successor(s.bucket_end); + s.bp_start = bpos_successor(s.bp_end); } +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + bch2_btree_cache_unpin(c); +err_free_bitmaps: + for_each_member_device(c, ca) { + kvfree(ca->bucket_backpointer_empty); + ca->bucket_backpointer_empty = NULL; + kvfree(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = NULL; + } bch_err_fn(c, ret); return ret; @@ -770,61 +1152,70 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, - struct bkey_s_c_backpointer bp, - struct bpos *last_flushed_pos) + struct bkey_s_c bp_k, + struct bkey_buf *last_flushed) { - struct bch_fs *c = trans->c; - struct btree_iter iter; + if (bp_k.k->type != KEY_TYPE_backpointer) + return 0; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bbpos pos = bp_to_bbpos(*bp.v); - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); + int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { - *last_flushed_pos = bp.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } - - if (fsck_err_on(!k.k, c, - backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? "btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; - } -out: -fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } +static int check_bucket_backpointers_to_extents(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket) +{ + u32 restart_count = trans->restart_count; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), + bucket_pos_to_bp_end(ca, bucket), + 0, k, + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + ); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret ?: trans_was_restarted(trans, restart_count); +} + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bpos last_flushed_pos = SPOS_MAX; + struct bch_fs *c = trans->c; + struct bkey_buf last_flushed; + struct progress_indicator_state progress; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed_pos)); + int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, ({ + progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + check_one_backpointer(trans, start, end, k, &last_flushed); + })); + + bch2_bkey_buf_exit(&last_flushed, c); + return ret; } int 
bch2_check_backpointers_to_extents(struct bch_fs *c) @@ -835,8 +1226,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) while (1) { ret = bch2_get_btree_in_memory_pos(trans, - (1U << BTREE_ID_extents)| - (1U << BTREE_ID_reflink), + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink), ~0, start, &end); if (ret) @@ -868,6 +1259,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + bch2_btree_cache_unpin(c); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 327365a9feac..060dad1521ee 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "super.h" static inline u64 swab40(u64 x) @@ -17,15 +18,14 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bkey_invalid_flags, struct printbuf *); -void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); -void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, + struct bkey_validate_context); +void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_invalid = bch2_backpointer_invalid, \ - .val_to_text = bch2_backpointer_k_to_text, \ + .key_validate = bch2_backpointer_validate, \ + .val_to_text = bch2_backpointer_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ }) @@ -36,96 +36,133 @@ void bch2_backpointer_swab(struct bkey_s); * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: */ -static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, - struct bpos bp_pos) +static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, + u32 *bucket_offset) +{ + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); +} + +static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); + rcu_read_unlock(); + return ca != NULL; +} + +static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, + struct bpos bucket, + u64 bucket_offset) +{ + return POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); +} + /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ -static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, +static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + 
bucket_offset); + struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); + return ret; +} - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); +static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) +{ + return bucket_pos_to_bp(ca, bucket, 0); +} - return ret; +static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) +{ + return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, - struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, + struct bkey_s_c, + struct bkey_i_backpointer *, + bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - - struct bkey_i_backpointer bp_k; - - bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); - bp_k.v = bp; + return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { - bp_k.k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k.k, 0); + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); } - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); } -static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p) +static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry) { - return level ? BCH_DATA_btree : - p.has_ec ? BCH_DATA_stripe : - BCH_DATA_user; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - struct bpos *bucket_pos, struct bch_backpointer *bp) + const union bch_extent_entry *entry, + struct bkey_i_backpointer *bp) { - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); - s64 sectors = level ? 
btree_sectors(c) : k.k->size; - u32 bucket_offset; - - *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { + bkey_backpointer_init(&bp->k_i); + bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, - .data_type = data_type, - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, - .bucket_len = ptr_disk_sectors(sectors, p), + .data_type = bch2_bkey_ptr_data_type(k, p, entry), + .bucket_gen = p.ptr.gen, + .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), .pos = k.k->p, }; } -int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, - struct bpos *, struct bch_backpointer *, unsigned); -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer, - unsigned); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer); +struct bkey_buf; +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, unsigned, struct bkey_buf *); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, struct bkey_buf *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h index be2edced5213..63abe17f35ea 100644 --- a/fs/bcachefs/bbpos.h +++ b/fs/bcachefs/bbpos.h @@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_id_str(pos.btree)); + bch2_btree_id_to_text(out, pos.btree); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b8..f63893344f80 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 69d0d60d50e3..161cf2f05d2a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,13 +205,16 @@ #include <linux/zstd.h> #include "bcachefs_format.h" +#include "btree_journal_iter_types.h" +#include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_types.h" +#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -265,6 +268,11 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +void bch2_print_str(struct bch_fs *, const char *); + +__printf(2, 3) +void bch2_print_opts(struct bch_opts *, const char *, ...); + __printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); @@ -286,6 +294,8 @@ do { \ #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_info_ratelimited(c, fmt, ...) \ + bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) 
\ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ @@ -345,6 +355,12 @@ do { \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) +#define bch_verbose_ratelimited(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ +} while (0) + #define pr_verbose_init(opts, fmt, ...) \ do { \ if (opt_get(opts, verbose)) \ @@ -355,6 +371,8 @@ do { \ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_node_merging_disabled, \ + "Disables merging of btree nodes") \ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ "Causes mark and sweep to compact and rewrite every " \ "btree node it traverses") \ @@ -438,6 +456,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ @@ -451,7 +470,9 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "btree_gc_types.h" #include "btree_types.h" +#include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" @@ -463,6 +484,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "sb-members_types.h" #include "subvolume_types.h" #include "super_types.h" #include "thread_with_file_types.h" @@ -480,55 +502,24 @@ enum bch_time_stats { struct btree; -enum gc_phase { - GC_PHASE_NOT_RUNNING, - GC_PHASE_START, - GC_PHASE_SB, - - GC_PHASE_BTREE_stripes, - GC_PHASE_BTREE_extents, - GC_PHASE_BTREE_inodes, - GC_PHASE_BTREE_dirents, - GC_PHASE_BTREE_xattrs, - GC_PHASE_BTREE_alloc, - GC_PHASE_BTREE_quotas, - GC_PHASE_BTREE_reflink, - GC_PHASE_BTREE_subvolumes, - GC_PHASE_BTREE_snapshots, - GC_PHASE_BTREE_lru, - GC_PHASE_BTREE_freespace, - GC_PHASE_BTREE_need_discard, - GC_PHASE_BTREE_backpointers, - GC_PHASE_BTREE_bucket_gens, - GC_PHASE_BTREE_snapshot_trees, - GC_PHASE_BTREE_deleted_inodes, - GC_PHASE_BTREE_logged_ops, - GC_PHASE_BTREE_rebalance_work, - - GC_PHASE_PENDING_DELETE, -}; - -struct gc_pos { - enum gc_phase phase; - struct bpos pos; - unsigned level; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef GENRADIX(struct reflink_gc) reflink_gc_table; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; +#ifdef CONFIG_BCACHEFS_DEBUG + atomic_long_t ref; + bool dying; + unsigned long last_put; +#else struct percpu_ref ref; +#endif struct completion ref_completion; struct percpu_ref io_ref; struct completion io_ref_completion; @@ -554,36 +545,38 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for writing multiple replicas */ - struct bio_set replica_set; - /* * Buckets: - * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for ptr_stale(): + * Per-bucket arrays are protected by either rcu_read_lock or + * state_lock, for device resize. 
*/ - struct bucket_array __rcu *buckets_gc; + GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; - struct rw_semaphore bucket_lock; - struct bch_dev_usage *usage_base; - struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; - struct bch_dev_usage __percpu *usage_gc; + unsigned long *bucket_backpointer_mismatches; + unsigned long *bucket_backpointer_empty; + + struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; - u64 alloc_cursor; + u64 alloc_cursor[3]; unsigned nr_open_buckets; + unsigned nr_partial_buckets; unsigned nr_btree_reserve; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -593,7 +586,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct bch2_time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -609,7 +602,11 @@ struct bch_dev { */ #define BCH_FS_FLAGS() \ + x(new_fs) \ x(started) \ + x(clean_recovery) \ + x(btree_running) \ + x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ x(was_rw) \ @@ -618,14 +615,15 @@ struct bch_dev { x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ + x(recovery_running) \ x(fsck_running) \ x(initial_gc_unfixed) \ - x(need_another_gc) \ x(need_delete_dead_snapshots) \ x(error) \ x(topology_error) \ x(errors_fixed) \ - x(errors_not_fixed) + x(errors_not_fixed) \ + x(no_invalid_checks) enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -662,38 +660,15 @@ struct journal_seq_blacklist_table { } entries[]; }; -struct journal_keys { - struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated; - bool overwritten; - struct bkey_i *k; - } *d; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). 
- */ - size_t gap; - size_t nr; - size_t size; - atomic_t ref; - bool initial_ref_held; -}; - struct btree_trans_buf { struct btree_trans *trans; }; -#define REPLICAS_DELTA_LIST_MAX (1U << 16) - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) #define BCH_WRITE_REFS() \ + x(journal) \ x(trans) \ x(write) \ x(promote) \ @@ -702,9 +677,14 @@ struct btree_trans_buf { x(stripe_delete) \ x(reflink) \ x(fallocate) \ + x(fsync) \ + x(dio_write) \ x(discard) \ + x(discard_fast) \ + x(check_discard_freespace_key) \ x(invalidate) \ x(delete_dead_snapshots) \ + x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) @@ -745,6 +725,12 @@ struct bch_fs { struct percpu_ref writes; #endif /* + * Certain operations are only allowed in single threaded mode, during + * recovery, and we want to assert that this is the case: + */ + struct task_struct *recovery_task; + + /* * Analagous to c->writes, for asynchronous ops that don't necessarily * need fs to be read-write */ @@ -755,15 +741,14 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_accounting_mem accounting; + struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; - mempool_t replicas_delta_pool; struct journal_entry_res btree_root_journal_res; - struct journal_entry_res replicas_journal_res; struct journal_entry_res clock_journal_res; - struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; @@ -775,6 +760,8 @@ struct bch_fs { __uuid_t user_uuid; u16 version; + u16 version_incompat; + u16 version_incompat_allowed; u16 version_min; u16 version_upgrade_complete; @@ -789,7 +776,8 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; - unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; + u64 btrees_lost_data; } sb; @@ -804,7 +792,6 @@ struct bch_fs { /* snapshot.c: */ struct snapshot_table __rcu *snapshots; - size_t snapshot_table_size; struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; @@ -815,7 +802,8 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; - struct workqueue_struct *io_complete_wq; + struct workqueue_struct *btree_read_complete_wq; + struct workqueue_struct *btree_write_submit_wq; struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; @@ -843,8 +831,11 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; - struct list_head pending_node_rewrites; - struct mutex pending_node_rewrites_lock; + struct workqueue_struct *btree_node_rewrite_worker; + struct list_head btree_node_rewrites; + struct list_head btree_node_rewrites_pending; + spinlock_t btree_node_rewrites_lock; + struct closure_waitlist btree_node_rewrites_wait; /* btree_io.c: */ spinlock_t btree_write_error_lock; @@ -881,8 +872,10 @@ struct bch_fs { /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; + unsigned long rw_devs_change_count; u64 capacity; /* sectors */ + u64 reserved; /* sectors */ /* * When capacity _decreases_ (due to a disk being removed), we @@ -900,27 +893,20 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; seqcount_t usage_lock; - struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; - struct bch_fs_usage __percpu *usage_gc; + struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; - /* 
single element mempool: */ - struct mutex usage_scratch_lock; - struct bch_fs_usage_online *usage_scratch; + unsigned long allocator_last_stuck; struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; @@ -940,12 +926,9 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct discard_work; - struct work_struct invalidate_work; /* GARBAGE COLLECTION */ - struct task_struct *gc_thread; - atomic_t kick_gc; + struct work_struct gc_gens_work; unsigned long gc_count; enum btree_id gc_gens_btree; @@ -975,6 +958,7 @@ struct bch_fs { struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; + struct bio_set replica_set; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; struct bucket_nocow_lock_table @@ -982,8 +966,7 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; - mempool_t decompress_workspace; + mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; struct crypto_shash *sha256; @@ -1041,6 +1024,8 @@ struct bch_fs { /* fs.c */ struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; + struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -1062,12 +1047,12 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. 
*/ enum bch_recovery_pass curr_recovery_pass; - /* bitmap of explicitly enabled recovery passes: */ - u64 recovery_passes_explicit; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ enum bch_recovery_pass recovery_pass_done; + spinlock_t recovery_pass_lock; struct semaphore online_fsck_mutex; /* DEBUG JUNK */ @@ -1078,9 +1063,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them @@ -1095,15 +1077,13 @@ struct bch_fs { struct journal_keys journal_keys; struct list_head journal_iters; + struct find_btree_nodes found_btree_nodes; + u64 last_bucket_seq_cleanup; u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned btree_gc_periodic:1; - unsigned copy_gc_enabled:1; - bool promote_whole_extents; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; @@ -1212,12 +1192,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; + s64 sec; s32 rem; time += c->sb.time_base_lo; - t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); - t.tv_nsec = rem * c->sb.nsec_per_time_unit; + sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); + + set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit); + return t; } @@ -1235,9 +1218,9 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } -static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) { - return dev < c->sb.nr_devices && c->devs[dev]; + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); } static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0668b682a21c..f70f0108401f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include <asm/byteorder.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include <uapi/linux/magic.h> #include "vstructs.h" #ifdef __KERNEL__ @@ -189,7 +190,11 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; struct bkey { /* Size of combined key and value, in u64s */ @@ -212,17 +217,46 @@ struct bkey { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u8 pad[1]; - struct bversion version; + struct bversion bversion; __u32 size; /* extent size, in sectors */ struct bpos p; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ struct bpos p; __u32 size; /* extent size, in sectors */ - struct bversion version; + struct bversion bversion; __u8 pad[1]; #endif -} __packed __aligned(8); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +/* + * The big-endian version of bkey can't be compiled by rustc with the "aligned" + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. + * So for Rust compatibility, don't include this. 
It can be included in the LE + * version because the "packed" attr is redundant in that case. + * + * History: (quoting Kent) + * + * Specifically, when i was designing bkey, I wanted the header to be no + * bigger than necessary so that bkey_packed could use the rest. That means that + * decently offten extent keys will fit into only 8 bytes, instead of spilling over + * to 16. + * + * But packed_bkey treats the part after the header - the packed section - + * as a single multi word, variable length integer. And bkey, the unpacked + * version, is just a special case version of a bkey_packed; all the packed + * bkey code will work on keys in any packed format, the in-memory + * representation of an unpacked key also is just one type of packed key... + * + * So that constrains the key part of a bkig endian bkey to start right + * after the header. + * + * If we ever do a bkey_v2 and need to expand the hedaer by another byte for + * some reason - that will clean up this wart. + */ +__aligned(8) +#endif +; struct bkey_packed { __u64 _data[0]; @@ -294,8 +328,8 @@ enum bch_bkey_fields { bkey_format_field(OFFSET, p.offset), \ bkey_format_field(SNAPSHOT, p.snapshot), \ bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, version.hi), \ - bkey_format_field(VERSION_LO, version.lo), \ + bkey_format_field(VERSION_HI, bversion.hi), \ + bkey_format_field(VERSION_LO, bversion.lo), \ }, \ }) @@ -383,7 +417,9 @@ static inline void bkey_init(struct bkey *k) x(bucket_gens, 30) \ x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) + x(logged_op_finsert, 33) \ + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -428,20 +464,12 @@ struct bch_backpointer { __u8 btree_id; __u8 level; __u8 data_type; - __u64 bucket_offset:40; + __u8 bucket_gen; + __u32 pad; __u32 bucket_len; struct bpos pos; } __packed __aligned(8); -/* LRU btree: */ - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define LRU_ID_STRIPES (1U << 16) - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -468,17 +496,25 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "dirent_format.h" +#include "disk_accounting_format.h" +#include "disk_groups_format.h" #include "extents_format.h" -#include "reflink_format.h" #include "ec_format.h" #include "inode_format.h" -#include "dirent_format.h" -#include "xattr_format.h" -#include "quota_format.h" +#include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "lru_format.h" +#include "quota_format.h" +#include "reflink_format.h" +#include "replicas_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-downgrade_format.h" +#include "sb-errors_format.h" +#include "sb-members_format.h" +#include "xattr_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -511,92 +547,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members_v1: */ - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - 
BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __le32 pad; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; -}; - -#define BCH_MEMBER_V1_BYTES 56 - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - /* BCH_SB_FIELD_crypt: */ struct nonce { @@ -645,94 +595,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -/* BCH_SB_FIELD_replicas: */ - -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[]; -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[]; -} __packed; - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_disk_groups: */ - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) 
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -760,43 +622,11 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; __le64 errors_silent[8]; -}; - -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; + __le64 btrees_lost_data; }; /* Superblock: */ @@ -840,7 +670,23 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) \ + x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ + x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ + x(disk_accounting_v2, BCH_VERSION(1, 9)) \ + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ + x(autofix_errors, BCH_VERSION(1, 19)) \ + x(directory_size, BCH_VERSION(1, 20)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -856,7 +702,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ + +#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ struct bch_sb_layout { __uuid_t magic; /* bcachefs superblock UUID */ @@ -956,6 +803,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); +LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, + struct bch_sb, flags[0], 63, 64); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); @@ -1000,6 +849,12 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); 
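
The new superblock options added here (BCH_SB_ALLOCATOR_STUCK_TIMEOUT, BCH_SB_VERSION_INCOMPAT and friends) are all declared with LE64_BITMASK, i.e. each option occupies a bit range within one of the __le64 flags words. The helpers below are only a sketch of that general pattern, with invented names; the real macro generates a per-field getter/setter pair and also handles the little-endian conversion of the on-disk field.

/* Generic bit-range accessors over a 64-bit flags word (illustrative only;
 * assumes hi > lo and hi - lo < 64). */
#include <stdint.h>

static inline uint64_t bits_get(uint64_t flags, unsigned lo, unsigned hi)
{
	return (flags >> lo) & ~(~0ULL << (hi - lo));
}

static inline uint64_t bits_set(uint64_t flags, unsigned lo, unsigned hi, uint64_t v)
{
	uint64_t mask = ~(~0ULL << (hi - lo)) << lo;

	return (flags & ~mask) | ((v << lo) & mask);
}

Given the declarations above, reading the incompat version field would correspond to something like bits_get(flags_word_5, 32, 48), after converting the on-disk __le64 flags word to host byte order.
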
+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, + struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -1052,21 +907,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(new_varint, 15) \ x(journal_no_flush, 16) \ x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) + x(extents_across_btree_nodes, 18) \ + x(incompat_version_field, 19) #define BCH_SB_FEATURES_ALWAYS \ - ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_btree_updates_journalled)|\ - (1ULL << BCH_FEATURE_alloc_v2)|\ - (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ + BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ + BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ + BIT_ULL(BCH_FEATURE_alloc_v2)|\ + BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ (BCH_SB_FEATURES_ALWAYS| \ - (1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_new_siphash)| \ + BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ + BIT_ULL(BCH_FEATURE_new_varint)| \ + BIT_ULL(BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1107,8 +963,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, @@ -1187,7 +1044,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) x(crc64, 2) \ x(xxhash, 3) -enum bch_csum_opts { +enum bch_csum_opt { #define x(t, n) BCH_CSUM_OPT_##t = n, BCH_CSUM_OPTS() #undef x @@ -1236,7 +1093,7 @@ enum bch_compression_opts { UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -#define BCACHEFS_STATFS_MAGIC 0xca451a4e +#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC #define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) #define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) @@ -1275,9 +1132,10 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) -enum { +enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, BCH_JSET_ENTRY_TYPES() #undef x @@ -1289,7 +1147,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e) switch (e->type) { case BCH_JSET_ENTRY_btree_keys: case BCH_JSET_ENTRY_btree_root: - case BCH_JSET_ENTRY_overwrite: case BCH_JSET_ENTRY_write_buffer_keys: return true; } @@ -1323,7 +1180,7 @@ struct jset_entry_blacklist_v2 { x(inodes, 1) \ x(key_version, 2) -enum { +enum bch_fs_usage_type { #define x(f, nr) BCH_FS_USAGE_##f = nr, BCH_FS_USAGE_TYPES() #undef x @@ -1376,6 +1233,20 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) +{ + unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); + + while (b && !l->d[b - 1]) + --b; + return b; +} + +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1418,14 +1289,18 @@ 
LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ enum btree_id_flags { - BTREE_ID_EXTENTS = BIT(0), - BTREE_ID_SNAPSHOTS = BIT(1), - BTREE_ID_SNAPSHOT_FIELD = BIT(2), - BTREE_ID_DATA = BIT(3), + BTREE_IS_extents = BIT(0), + BTREE_IS_snapshots = BIT(1), + BTREE_IS_snapshot_field = BIT(2), + BTREE_IS_data = BIT(3), + BTREE_IS_write_buffer = BIT(4), }; #define BCH_BTREE_IDS() \ - x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + x(extents, 0, \ + BTREE_IS_extents| \ + BTREE_IS_snapshots| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ @@ -1433,17 +1308,20 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_reservation)| \ BIT_ULL(KEY_TYPE_reflink_p)| \ BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + x(inodes, 1, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_inode)| \ BIT_ULL(KEY_TYPE_inode_v2)| \ BIT_ULL(KEY_TYPE_inode_v3)| \ BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + x(dirents, 2, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + x(xattrs, 3, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ @@ -1457,32 +1335,49 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_quota)) \ x(stripes, 6, 0, \ BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + x(reflink, 7, \ + BTREE_IS_extents| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_reflink_v)| \ - BIT_ULL(KEY_TYPE_indirect_inline_data)) \ + BIT_ULL(KEY_TYPE_indirect_inline_data)| \ + BIT_ULL(KEY_TYPE_error)) \ x(subvolumes, 8, 0, \ BIT_ULL(KEY_TYPE_subvolume)) \ x(snapshots, 9, 0, \ BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, 0, \ + x(lru, 10, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, BTREE_ID_EXTENTS, \ + x(freespace, 11, \ + BTREE_IS_extents, \ BIT_ULL(KEY_TYPE_set)) \ x(need_discard, 12, 0, \ BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, 0, \ + x(backpointers, 13, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_backpointer)) \ x(bucket_gens, 14, 0, \ BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + x(deleted_inodes, 16, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ - x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ + x(rebalance_work, 18, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(accounting, 20, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { #define x(name, nr, ...) 
BTREE_ID_##name = nr, @@ -1491,6 +1386,29 @@ enum btree_id { BTREE_ID_NR }; +/* + * Maximum number of btrees that we will _ever_ have under the current scheme, + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: + */ +#define BTREE_ID_NR_MAX 63 + +static inline bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + case BTREE_ID_lru: + case BTREE_ID_accounting: + return true; + default: + return false; + } +} + #define BTREE_MAX_DEPTH 4U /* Btree nodes */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 4b8fba754b1c..3c23bdf788ce 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -5,6 +5,7 @@ #include <linux/uuid.h> #include <asm/ioctl.h> #include "bcachefs_format.h" +#include "bkey_types.h" /* * Flags common to multiple ioctls: @@ -85,6 +86,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) +#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -251,12 +253,18 @@ struct bch_replicas_usage { struct bch_replicas_entry_v1 r; } __packed; +static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) +{ + return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); +} + static inline struct bch_replicas_usage * replicas_usage_next(struct bch_replicas_usage *u) { - return (void *) u + replicas_entry_bytes(&u->r) + 8; + return (void *) u + replicas_usage_bytes(u); } +/* Obsolete */ /* * BCH_IOCTL_FS_USAGE: query filesystem disk space usage * @@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage { struct bch_replicas_usage replicas[]; }; +/* Obsolete */ /* * BCH_IOCTL_DEV_USAGE: query device disk space usage * @@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage { } d[10]; }; +/* Obsolete */ struct bch_ioctl_dev_usage_v2 { __u64 dev; __u32 flags; @@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online { __u64 opts; /* string */ }; +/* + * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. 
+ * + * Returns -ERANGE if @replica_entries_bytes was too small + */ +struct bch_ioctl_query_accounting { + __u64 capacity; + __u64 used; + __u64 online_reserved; + + __u32 accounting_u64s; /* input parameter */ + __u32 accounting_types_mask; /* input parameter */ + + struct bkey_i_accounting accounting[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 76e79a15ba08..995ba32e9b6e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -640,10 +640,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) int bch2_bkey_format_invalid(struct bch_fs *c, struct bkey_format *f, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { - unsigned i, bits = KEY_PACKED_BITS_START; + unsigned bits = KEY_PACKED_BITS_START; if (f->nr_fields != BKEY_NR_FIELDS) { prt_printf(err, "incorrect number of fields: got %u, should be %u", @@ -655,21 +655,18 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * Verify that the packed format can't represent fields larger than the * unpacked format: */ - for (i = 0; i < f->nr_fields; i++) { - if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + for (unsigned i = 0; i < f->nr_fields; i++) { + if (bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + unsigned packed_bits = min(64, f->bits_per_field[i]); + u64 packed_max = packed_bits + ? ~((~0ULL << 1) << (packed_bits - 1)) : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) { - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, field_offset, unpacked_max); - return -BCH_ERR_invalid; - } + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); + return -BCH_ERR_invalid; } bits += f->bits_per_field[i]; @@ -1067,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 831be01809f2..054e2d5e8448 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -4,17 +4,11 @@ #include <linux/bug.h> #include "bcachefs_format.h" - +#include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), -}; - #if 0 /* @@ -31,57 +25,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -245,6 +188,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? l : r; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); @@ -257,9 +207,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r) #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -static __always_inline int bversion_zero(struct bversion v) +static __always_inline bool bversion_zero(struct bversion v) { - return !bversion_cmp(v, ZERO_VERSION); + return bversion_cmp(v, ZERO_VERSION) == 0; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -362,10 +312,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k) static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; + return bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; +} - EBUG_ON(k->u64s < ret); - return ret; +static inline bool bkeyp_u64s_valid(const struct bkey_format *f, + const struct bkey_packed *k) +{ + return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, @@ -553,155 +506,6 @@ static inline void bkey_reassemble(struct bkey_i *dst, memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != 
KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -743,8 +547,8 @@ static inline void bch2_bkey_pack_test(void) {} x(BKEY_FIELD_OFFSET, p.offset) \ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) + x(BKEY_FIELD_VERSION_HI, bversion.hi) \ + x(BKEY_FIELD_VERSION_LO, bversion.lo) struct bkey_format_state { u64 field_min[BKEY_NR_FIELDS]; @@ -771,8 +575,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); + +static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) +{ + unsigned f_bits = f->bits_per_field[i]; + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f_bits > unpacked_bits) + return true; + + if ((f_bits == unpacked_bits) && field_offset) + return true; + + u64 f_mask = f_bits + ? 
~((~0ULL << (f_bits - 1)) << 1) + : 0; + + if (((field_offset + f_mask) & unpacked_mask) < field_offset) + return true; + return false; +} + int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e52684764eb..15c93576b5c2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -26,27 +27,27 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) -static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, - bkey_val_size_nonzero, + bkey_fsck_err_on(bkey_val_bytes(k.k), + c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: @@ -54,11 +55,11 @@ fsck_err: } #define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { return 0; } @@ -72,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { return 0; } @@ -97,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_invalid = key_type_inline_data_invalid, \ - .val_to_text = key_type_inline_data_to_text, \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_validate = key_type_inline_data_validate, \ + .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -109,7 +110,7 @@ static bool key_type_set_merge(struct 
bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) @@ -122,22 +123,24 @@ const struct bkey_ops bch2_bkey_ops[] = { const struct bkey_ops bch2_bkey_null_ops = { }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, - bkey_val_size_too_small, + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, + c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); - if (!ops->key_invalid) + if (!ops->key_validate) return 0; - ret = ops->key_invalid(c, k, flags, err); + ret = ops->key_validate(c, k, from); fsck_err: return ret; } @@ -157,38 +160,45 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); } -int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bkey_invalid_flags flags, - struct printbuf *err) +int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { + enum btree_node_type type = __btree_node_type(from.level, from.btree); + + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + int ret = 0; - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, - bkey_u64s_too_small, + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, + c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, - bkey_invalid_type_for_btree, + bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && + (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + bch2_btree_node_type_str(type), + k.k->type < KEY_TYPE_MAX + ? 
bch2_bkey_types[k.k->type] + : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, c, err, - bkey_extent_size_zero, + bkey_fsck_err_on(k.k->size == 0, + c, bkey_extent_size_zero, "size == 0"); - bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, - bkey_extent_size_greater_than_offset, + bkey_fsck_err_on(k.k->size > k.k->p.offset, + c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { - bkey_fsck_err_on(k.k->size, c, err, - bkey_size_nonzero, + bkey_fsck_err_on(k.k->size, + c, bkey_size_nonzero, "size != 0"); } @@ -196,12 +206,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, c, err, - bkey_snapshot_zero, + bkey_fsck_err_on(!k.k->p.snapshot, + c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, c, err, - bkey_snapshot_nonzero, + bkey_fsck_err_on(k.k->p.snapshot, + c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* @@ -210,34 +220,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, */ } - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, - bkey_at_pos_max, + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), + c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } -int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { - return __bch2_bkey_invalid(c, k, type, flags, err) ?: - bch2_bkey_val_invalid(c, k, flags, err); + return __bch2_bkey_validate(c, k, from) ?: + bch2_bkey_val_validate(c, k, from); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, struct printbuf *err) + struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, - bkey_before_start_of_btree_node, + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), + c, bkey_before_start_of_btree_node, "key before start of btree node"); - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, - bkey_after_end_of_btree_node, + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), + c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; @@ -281,7 +290,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) bch2_bpos_to_text(out, k->p); - prt_printf(out, " len %u ver %llu", k->size, k->version.lo); + prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); } else { prt_printf(out, "(null)"); } @@ -388,8 +397,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? 
i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 03efe8ee565a..bf34111cdf00 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,22 +14,23 @@ extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; /* - * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. * * When invalid, error string is returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err); + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -47,14 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, - struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, + struct bkey_validate_context from); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); @@ -70,62 +71,16 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) { return l->type == r->type && - !bversion_cmp(l->version, r->version) && + !bversion_cmp(l->bversion, r->bversion) && bpos_eq(l->p, bkey_start_pos(r)); } bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, - __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, - __BTREE_TRIGGER_TRANSACTIONAL, - __BTREE_TRIGGER_ATOMIC, - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - 
__BTREE_TRIGGER_BUCKET_INVALIDATE, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -/* Don't run triggers at all */ -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -/* - * If set, we're running transactional triggers as part of a transaction commit: - * triggers may generate new updates - * - * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, - * we're running atomic triggers during a transaction commit: we have our - * journal reservation, we're holding btree node write locks, and we know the - * transaction is going to commit (returning an error here is a fatal error, - * causing us to go emergency read-only) - */ -#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) -#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) - -/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - -/* @new is entering the btree */ -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) - -/* @old is leaving the btree */ -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -/* signal from bucket invalidate path to alloc trigger */ -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) - static inline int bch2_key_trigger(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); @@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans, } static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans, deleted.k.p = old.k->p; return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_overwrite|flags); } static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s new, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans, deleted.k.p = new.k->p; return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + BTREE_TRIGGER_insert|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); @@ -172,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index bcca9e76a0b4..4536eb50fc40 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -6,9 +6,9 @@ #include "bset.h" #include "extents.h" -typedef 
int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); +typedef int (*sort_cmp_fn)(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); static inline bool sort_iter_end(struct sort_iter *iter) { @@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, /* * If keys compare equal, compare by pointer order: */ -static inline int key_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int key_sort_fix_overlapping_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); @@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; + (long) l - (long) r; } -unsigned bch2_sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) +#include "btree_update_interior.h" + +/* + * For sorting in the btree node write path: whiteouts not in the unwritten + * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are + * dropped if overwritten by real keys: + */ +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) { - const struct bkey_format *f = &iter->b->format; struct bkey_packed *in, *next, *out = dst; - sort_iter_sort(iter, sort_keys_cmp); + sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - bool needs_whiteout = false; + while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { + if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) + continue; - if (bkey_deleted(in) && - (filter_whiteouts || !in->needs_whiteout)) + if ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) continue; - while ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - needs_whiteout |= in->needs_whiteout; - in = sort_iter_next(iter, sort_keys_cmp); - } + bkey_p_copy(out, in); + out = bkey_p_next(out); + } - if (bkey_deleted(in)) { - memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_p_copy(out, in); - } - out->needs_whiteout |= needs_whiteout; + return (u64 *) out - (u64 *) dst; +} + +/* + * Main sort routine for compacting a btree node in memory: we always drop + * whiteouts because any whiteouts that need to be written are in the unwritten + * whiteouts area: + */ +unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); + + while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { + if (bkey_deleted(in)) + continue; + + bkey_p_copy(out, in); out = bkey_p_next(out); } diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 7c0f0b160f18..9be969d46890 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -48,7 +48,7 @@ bch2_sort_repack(struct bset *, struct btree *, 
struct btree_node_iter *, struct bkey_format *, bool); -unsigned bch2_sort_keys(struct bkey_packed *, - struct sort_iter *, bool); +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); +unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h new file mode 100644 index 000000000000..b4f328f9853c --- /dev/null +++ b/fs/bcachefs/bkey_types.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_TYPES_H +#define _BCACHEFS_BKEY_TYPES_H + +#include "bcachefs_format.h" + +/* + * bkey_i - bkey with inline value + * bkey_s - bkey with split value + * bkey_s_c - bkey with split value, const + */ + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) 
\ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +enum bch_validate_flags { + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_silent = BIT(2), +}; + +#define BKEY_VALIDATE_CONTEXTS() \ + x(unknown) \ + x(superblock) \ + x(journal) \ + x(btree_root) \ + x(btree_node) \ + x(commit) + +struct bkey_validate_context { + enum { +#define x(n) BKEY_VALIDATE_##n, + BKEY_VALIDATE_CONTEXTS() +#undef x + } from:8; + enum bch_validate_flags flags:8; + u8 level; + enum btree_id btree; + bool root:1; + unsigned journal_offset; + u64 journal_seq; +}; + +#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3fd1085b6c61..9a4a83d6fd2d 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -13,7 +13,7 @@ #include "trace.h" #include "util.h" -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/console.h> #include 
<linux/random.h> #include <linux/prefetch.h> @@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - console_lock(); for_each_bset(b, t) bch2_dump_bset(c, b, bset(b, t), t - b->set); @@ -134,18 +132,23 @@ void bch2_dump_btree_node_iter(struct btree *b, printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_verify_btree_nr_keys(struct btree *b) +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { - struct bset_tree *t; struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; + struct btree_nr_keys nr = {}; for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); + return nr; +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } @@ -192,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; - struct bset_tree *t; if (bch2_btree_node_iter_end(iter)) return; @@ -207,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { for_each_bset(b, t) - if (set->end == t->end_offset) + if (set->end == t->end_offset) { + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); goto found; + } BUG(); found: - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); + do {} while (0); } /* Verify iterator is sorted: */ @@ -300,11 +304,6 @@ struct bkey_float { }; #define BKEY_MANTISSA_BITS 16 -static unsigned bkey_float_byte_offset(unsigned idx) -{ - return idx * sizeof(struct bkey_float); -} - struct ro_aux_tree { u8 nothing[0]; struct bkey_float f[]; @@ -324,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + - t->size * sizeof(u8), 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -356,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, return __aux_tree_base(b, t); } -static u8 *ro_aux_tree_prev(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) @@ -371,11 +361,9 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(const struct btree *b) +static void bset_aux_tree_verify(struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct bset_tree *t; - for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) continue; @@ -477,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, bkey_float(b, t, j)->key_offset); } -static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - - return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); -} - static struct rw_aux_tree *rw_aux_tree(const struct btree *b, const 
struct bset_tree *t) { @@ -583,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, } static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { u64 v; @@ -615,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, struct bkey_packed *m = tree_to_bkey(b, t, j); struct bkey_packed *l = is_power_of_2(j) ? min_key - : tree_to_prev_bkey(b, t, j >> ffs(j)); + : tree_to_bkey(b, t, j >> ffs(j)); struct bkey_packed *r = is_power_of_2(j + 1) ? max_key : tree_to_bkey(b, t, j >> (ffz(j) + 1)); @@ -666,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); f->exponent = shift; - mantissa = bkey_mantissa(m, f, j); + mantissa = bkey_mantissa(m, f); /* * If we've got garbage bits, set them to all 1s - it's legal for the @@ -679,20 +657,19 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { - return __bset_tree_capacity(b, t) / - (sizeof(struct bkey_float) + sizeof(u8)); + return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); } -static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -718,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { - struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_packed *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; unsigned cacheline = 1; @@ -731,12 +708,12 @@ retry: return; } - t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + t->extra = eytzinger1_extra(t->size - 1); /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_p_next(k); + k = bkey_p_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -744,17 +721,12 @@ retry: goto retry; } - ro_aux_tree_prev(b, t)[j] = prev->u64s; bkey_float(b, t, j)->key_offset = bkey_to_cacheline_offset(b, t, cacheline++, k); - EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_p_next(k); - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); min_key.k.p = b->data->min_key; @@ -913,6 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ +static void rw_aux_tree_insert_entry(struct btree *b, + struct bset_tree *t, + unsigned idx) +{ + EBUG_ON(!idx || idx > t->size); + struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); + struct bkey_packed *end = idx < t->size + ? 
rw_aux_to_bkey(b, t, idx) + : btree_bkey_last(b, t); + + if (t->size < bset_rw_tree_capacity(b, t) && + (void *) end - (void *) start > L1_CACHE_BYTES) { + struct bkey_packed *k = start; + + while (1) { + k = bkey_p_next(k); + if (k == end) + break; + + if ((void *) k - (void *) start >= L1_CACHE_BYTES) { + memmove(&rw_aux_tree(b, t)[idx + 1], + &rw_aux_tree(b, t)[idx], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[idx]); + t->size++; + rw_aux_tree_set(b, t, idx, k); + break; + } + } + } +} + static void bch2_bset_fix_lookup_table(struct btree *b, struct bset_tree *t, struct bkey_packed *_where, @@ -920,84 +924,59 @@ static void bch2_bset_fix_lookup_table(struct btree *b, unsigned new_u64s) { int shift = new_u64s - clobber_u64s; - unsigned l, j, where = __btree_node_key_to_offset(b, _where); + unsigned idx, j, where = __btree_node_key_to_offset(b, _where); EBUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) return; + if (where > rw_aux_tree(b, t)[t->size - 1].offset) { + rw_aux_tree_insert_entry(b, t, t->size); + goto verify; + } + /* returns first entry >= where */ - l = rw_aux_tree_bsearch(b, t, where); - - if (!l) /* never delete first entry */ - l++; - else if (l < t->size && - where < t->end_offset && - rw_aux_tree(b, t)[l].offset == where) - rw_aux_tree_set(b, t, l++, _where); - - /* l now > where */ - - for (j = l; - j < t->size && - rw_aux_tree(b, t)[j].offset < where + clobber_u64s; - j++) - ; - - if (j < t->size && - rw_aux_tree(b, t)[j].offset + shift == - rw_aux_tree(b, t)[l - 1].offset) - j++; - - memmove(&rw_aux_tree(b, t)[l], - &rw_aux_tree(b, t)[j], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[j]); - t->size -= j - l; - - for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; + idx = rw_aux_tree_bsearch(b, t, where); + + if (rw_aux_tree(b, t)[idx].offset == where) { + if (!idx) { /* never delete first entry */ + idx++; + } else if (where < t->end_offset) { + rw_aux_tree_set(b, t, idx++, _where); + } else { + EBUG_ON(where != t->end_offset); + rw_aux_tree_insert_entry(b, t, --t->size); + goto verify; + } + } - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset == - rw_aux_tree(b, t)[l - 1].offset); + EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); + if (idx < t->size && + rw_aux_tree(b, t)[idx].offset + shift == + rw_aux_tree(b, t)[idx - 1].offset) { + memmove(&rw_aux_tree(b, t)[idx], + &rw_aux_tree(b, t)[idx + 1], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[idx + 1]); + t->size -= 1; + } - if (t->size < bset_rw_tree_capacity(b, t) && - (l < t->size - ? rw_aux_tree(b, t)[l].offset - : t->end_offset) - - rw_aux_tree(b, t)[l - 1].offset > - L1_CACHE_BYTES / sizeof(u64)) { - struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); - struct bkey_packed *end = l < t->size - ? 
rw_aux_to_bkey(b, t, l) - : btree_bkey_last(b, t); - struct bkey_packed *k = start; + for (j = idx; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; - while (1) { - k = bkey_p_next(k); - if (k == end) - break; + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx - 1].offset); - if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[l + 1], - &rw_aux_tree(b, t)[l], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[l]); - t->size++; - rw_aux_tree_set(b, t, l, k); - break; - } - } - } + rw_aux_tree_insert_entry(b, t, idx); +verify: bch2_bset_verify_rw_aux_tree(b, t); bset_aux_tree_verify(b); } void bch2_bset_insert(struct btree *b, - struct btree_node_iter *iter, struct bkey_packed *where, struct bkey_i *insert, unsigned clobber_u64s) @@ -1096,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p) } static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; @@ -1131,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, goto slowpath; l = f->mantissa; - r = bkey_mantissa(packed_search, f, n); + r = bkey_mantissa(packed_search, f); - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) goto slowpath; n = n * 2 + (l < r); @@ -1368,8 +1346,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, struct btree *b) { - struct bset_tree *t; - memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) @@ -1475,7 +1451,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, { struct bkey_packed *k, *prev = NULL; struct btree_node_iter_set *set; - struct bset_tree *t; unsigned end = 0; if (bch2_expensive_debug_checks) @@ -1544,9 +1519,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - const struct bset_tree *t; - - for_each_bset(b, t) { + for_each_bset_c(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); size_t j; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 79c77baaa383..6953d55b72cc 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) } #define for_each_bset(_b, _t) \ - for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +#define for_each_bset_c(_b, _t) \ + for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ @@ -267,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_insert(struct btree *, struct btree_node_iter *, - struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, + unsigned); void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); /* Bkey utility code */ @@ -294,7 +297,6 @@ static inline struct bset_tree * 
bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; for_each_bset(b, t) if (offset <= t->end_offset) { @@ -458,6 +460,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, /* Accounting: */ +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); + static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset, struct bkey_packed *k, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d7c81beac14a..1ec1f90e0eb3 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -14,9 +15,19 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/swap.h> + +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ +} while (0) const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL @@ -24,43 +35,82 @@ const char * const bch2_btree_node_flags[] = { void bch2_recalc_btree_reserve(struct bch_fs *c) { - unsigned i, reserve = 16; + unsigned reserve = 16; if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) reserve += min_t(unsigned, 1, r->b->c.level) * 8; } - c->btree_cache.reserve = reserve; + c->btree_cache.nr_reserve = reserve; } -static inline unsigned btree_cache_can_free(struct btree_cache *bc) +static inline size_t btree_cache_can_free(struct btree_cache_list *list) { - return max_t(int, 0, bc->used - bc->reserve); + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + size_t can_free = list->nr; + if (!list->idx) + can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); + return can_free; } static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); + if (b->c.lock.readers) - list_move(&b->list, &bc->freed_pcpu); + list_add(&b->list, &bc->freed_pcpu); else - list_move(&b->list, &bc->freed_nonpcpu); + list_add(&b->list, &bc->freed_nonpcpu); } -static void btree_node_data_free(struct bch_fs *c, struct btree *b) +static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(!b->data); + + bc->nr_freeable++; + list_add(&b->list, &bc->freeable); +} + +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + mutex_lock(&bc->lock); + __bch2_btree_node_to_freelist(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(btree_node_hashed(b)); + + /* + * This should really be done in slub/vmalloc, but we're using the + * kmalloc_large() path, so we're working around a slub bug by doing + * this here: + */ + if (b->data) + mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); + if (b->aux_data) + mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); + EBUG_ON(btree_node_write_in_flight(b)); 
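The reworked for_each_bset() in bset.h now declares its own iterator inside the for statement, which is why the standalone `struct bset_tree *t;` declarations disappear throughout bset.c in the hunks above (for_each_bset_c() is the const flavour of the same pattern). A minimal standalone sketch of that macro idiom, using invented names (struct item, for_each_item) rather than the bcachefs types:

#include <stdio.h>

struct item { int v; };

struct container {
	struct item	set[4];
	unsigned	nsets;
};

/* Iterator macro that owns its loop variable, mirroring the new for_each_bset() */
#define for_each_item(_c, _it) \
	for (struct item *_it = (_c)->set; _it < (_c)->set + (_c)->nsets; _it++)

int main(void)
{
	struct container c = { .set = { {1}, {2}, {3} }, .nsets = 3 };

	/* no separate "struct item *it;" declaration needed at the call site */
	for_each_item(&c, it)
		printf("%d\n", it->v);
	return 0;
}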
clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -69,11 +119,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->used--; - btree_node_to_freedlist(bc, b); } +static void btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); + --bc->nr_freeable; + __btree_node_data_free(bc, b); +} + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -84,17 +140,20 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; + + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +166,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -144,51 +203,144 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - bc->used++; - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); return b; } +static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) +{ + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = bc->pinned_nodes_mask[!!b->c.level]; + + return ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); +} + +void bch2_node_pin(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } + mutex_unlock(&bc->lock); +} + +void bch2_btree_cache_unpin(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *n; + + mutex_lock(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + + list_for_each_entry_safe(b, n, &bc->live[1].list, list) { + clear_btree_node_pinned(b); + list_move(&b->list, &bc->live[0].list); + bc->live[0].nr++; + bc->live[1].nr--; + } + + mutex_unlock(&bc->lock); +} + /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + lockdep_assert_held(&bc->lock); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); /* Cause future lookups for this 
node to fail: */ b->hash_val = 0; + + if (b->c.btree_id < BTREE_ID_NR) + --bc->nr_by_btree[b->c.btree_id]; + --bc->live[btree_node_pinned(b)].nr; + list_del_init(&b->list); +} + +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + __bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_to_freelist(bc, b); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + if (ret) + return ret; + + if (b->c.btree_id < BTREE_ID_NR) + bc->nr_by_btree[b->c.btree_id]++; + + bool p = __btree_node_pinned(bc, b); + mod_bit(BTREE_NODE_pinned, &b->flags, p); - return rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); + list_add_tail(&b->list, &bc->live[p].list); + bc->live[p].nr++; + return 0; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, unsigned level, enum btree_id id) { - int ret; - b->c.level = level; b->c.btree_id = id; mutex_lock(&bc->lock); - ret = __bch2_btree_node_hash_insert(bc, b); - if (!ret) - list_add_tail(&b->list, &bc->live); + int ret = __bch2_btree_node_hash_insert(bc, b); mutex_unlock(&bc->lock); return ret; } +void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) @@ -202,7 +354,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -212,38 +364,64 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } - if (!six_trylock_write(&b->c.lock)) + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); goto out_unlock_intent; + } /* recheck under lock */ if (b->flags & ((1U << 
BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); goto out_unlock; + } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b) || - btree_node_write_blocked(b) || - btree_node_will_make_reachable(b)) + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + goto out_unlock; + } if (btree_node_dirty(b)) { - if (!flush) + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); goto out_unlock; + } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -273,21 +451,22 @@ out_unlock_intent: goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) { - return __btree_node_reclaim(c, b, false); + return __btree_node_reclaim(c, b, false, shrinker_counter); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true); + return __btree_node_reclaim(c, b, true, false); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; @@ -295,8 +474,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_read(&bc->dirty) + nr >= - bc->used * 3 / 4; + bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -311,7 +489,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - can_free = btree_cache_can_free(bc); + can_free = btree_cache_can_free(list); nr = min_t(unsigned long, nr, can_free); i = 0; @@ -328,24 +506,29 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b)) { - btree_node_data_free(c, b); + if (!btree_node_reclaim(c, b, true)) { + btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->nr_freed++; } } restart: - list_for_each_entry_safe(b, t, &bc->live, list) { + list_for_each_entry_safe(b, t, &list->list, list) { touched++; if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - } else if (!btree_node_reclaim(c, b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; + --touched;; + } else if (!btree_node_reclaim(c, b, true)) { + __bch2_btree_node_hash_remove(bc, b); + __btree_node_data_free(bc, b); + freed++; - btree_node_data_free(c, b); + 
bc->nr_freed++; - bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -356,7 +539,7 @@ restart: !btree_node_will_make_reachable(b) && !btree_node_write_blocked(b) && six_trylock_read(&b->c.lock)) { - list_move(&bc->live, &b->list); + list_move(&list->list, &b->list); mutex_unlock(&bc->lock); __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); @@ -370,8 +553,8 @@ restart: break; } out_rotate: - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (&t->list != &list->list) + list_move_tail(&list->list, &t->list); out: mutex_unlock(&bc->lock); out_nounlock: @@ -384,57 +567,57 @@ out_nounlock: static unsigned long bch2_btree_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; if (bch2_btree_shrinker_disabled) return 0; - return btree_cache_can_free(bc); + return btree_cache_can_free(list); } void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; - unsigned i, flags; + struct btree *b, *t; + unsigned long flags; - shrinker_free(bc->shrink); + shrinker_free(bc->live[1].shrink); + shrinker_free(bc->live[0].shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); mutex_lock(&bc->lock); if (c->verify_data) - list_move(&c->verify_data->list, &bc->live); + list_move(&c->verify_data->list, &bc->live[0].list); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) - list_add(&r->b->list, &bc->live); + list_add(&r->b->list, &bc->live[0].list); } - list_splice(&bc->freeable, &bc->live); - - while (!list_empty(&bc->live)) { - b = list_first_entry(&bc->live, struct btree, list); + list_for_each_entry_safe(b, t, &bc->live[1].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->live[0].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->freeable, list) { BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); } BUG_ON(!bch2_journal_error(&c->journal) && - atomic_read(&c->btree_cache.dirty)); + atomic_long_read(&c->btree_cache.nr_dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - while (!list_empty(&bc->freed_nonpcpu)) { - b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { list_del(&b->list); six_lock_exit(&b->c.lock); kfree(b); @@ -443,6 +626,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + BUG_ON(bc->nr_by_btree[i]); + BUG_ON(bc->live[0].nr); + BUG_ON(bc->live[1].nr); + BUG_ON(bc->nr_freeable); + if (bc->table_init_done) rhashtable_destroy(&bc->table); } @@ -462,22 +651,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->reserve; i++) + for (i = 0; i < bc->nr_reserve; i++) if (!__bch2_btree_node_mem_alloc(c)) goto err; - list_splice_init(&bc->live, &bc->freeable); + list_splice_init(&bc->live[0].list, &bc->freeable); mutex_init(&c->verify_lock); shrink = shrinker_alloc(0, "%s-btree_cache", c->name); 
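bch2_btree_cache_scan() clamps nr_to_scan to btree_cache_can_free(), and only the unpinned live list holds back the nr_reserve nodes needed for forward progress; the pinned list reports its full size but is registered with a higher shrinker seeks value so it is reclaimed more reluctantly. A simplified userspace model of that accounting, with an invented struct cache_list and made-up numbers rather than the real btree_cache types:

#include <stdio.h>
#include <stddef.h>

/* simplified model: list 0 is the ordinary LRU, list 1 holds pinned nodes */
struct cache_list { size_t nr; int idx; };

static size_t can_free(const struct cache_list *list, size_t nr_reserve)
{
	size_t n = list->nr;

	/* only the unpinned list subtracts the reserve, clamped at zero */
	if (list->idx == 0)
		n = n > nr_reserve ? n - nr_reserve : 0;
	return n;
}

int main(void)
{
	struct cache_list live[2] = { { .nr = 20, .idx = 0 }, { .nr = 5, .idx = 1 } };
	size_t reserve = 16;

	printf("unpinned can_free: %zu\n", can_free(&live[0], reserve)); /* 20 - 16 = 4 */
	printf("pinned   can_free: %zu\n", can_free(&live[1], reserve)); /* 5 */
	return 0;
}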
if (!shrink) goto err; - bc->shrink = shrink; + bc->live[0].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); + + shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); + if (!shrink) + goto err; + bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 4; - shrink->private_data = c; + shrink->seeks = 8; + shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; @@ -488,7 +687,10 @@ err: void bch2_fs_btree_cache_init_early(struct btree_cache *bc) { mutex_init(&bc->lock); - INIT_LIST_HEAD(&bc->live); + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { + bc->live[i].idx = i; + INIT_LIST_HEAD(&bc->live[i].list); + } INIT_LIST_HEAD(&bc->freeable); INIT_LIST_HEAD(&bc->freed_pcpu); INIT_LIST_HEAD(&bc->freed_nonpcpu); @@ -518,8 +720,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure struct btree_cache *bc = &c->btree_cache; struct task_struct *old; - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) goto success; if (!cl) { @@ -530,8 +732,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure closure_wait(&bc->alloc_wait, cl); /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) { + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { /* We raced */ closure_wake_up(&bc->alloc_wait); goto success; @@ -550,14 +752,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_reclaim(c, b, false)) + return b; while (1) { - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; /* * Rare case: all nodes were intent-locked. @@ -577,9 +781,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea : &bc->freed_nonpcpu; struct btree *b, *b2; u64 start_time = local_clock(); - unsigned flags; - flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -587,36 +789,42 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, false)) { list_del_init(&b->list); goto got_node; } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? 
SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); -got_node: +got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { + if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); + + list_del_init(&b2->list); + --bc->nr_freeable; btree_node_to_freedlist(bc, b2); + mutex_unlock(&bc->lock); + six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -630,11 +838,8 @@ got_node: goto err; } - mutex_lock(&bc->lock); - bc->used++; got_mem: - mutex_unlock(&bc->lock); - + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -651,7 +856,12 @@ out: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); - memalloc_nofs_restore(flags); + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: mutex_lock(&bc->lock); @@ -660,7 +870,7 @@ err: if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); clear_btree_node_just_written(b2); - bch2_btree_node_hash_remove(bc, b2); + __bch2_btree_node_hash_remove(bc, b2); if (b) { swap(b->data, b2->data); @@ -670,9 +880,9 @@ err: six_unlock_intent(&b2->c.lock); } else { b = b2; - list_del_init(&b->list); } + BUG_ON(!list_empty(&b->list)); mutex_unlock(&bc->lock); trace_and_count(c, btree_cache_cannibalize, trans); @@ -680,7 +890,6 @@ err: } mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } @@ -696,9 +905,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: @@ -711,6 +942,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -727,7 +961,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b->hash_val = 0; mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); 
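A side note on the bch2_btree_cache_cannibalize_lock() hunks a little further up: the cmpxchg() pattern is converted to try_cmpxchg(), which returns whether the exchange succeeded and, on failure, writes the value it actually observed back through its second argument. That reads more directly as "take the lock if it's free, or notice we already hold it", and typically compiles a little better because the result of the compare-and-swap is used directly. A standalone illustration of the same pattern using C11 atomics (not the kernel primitives; all names here are made up for the example):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static _Atomic(void *) alloc_lock;

	static bool cannibalize_trylock(void *task)
	{
		void *old = NULL;

		/* On failure, 'old' is updated to the current holder: */
		if (atomic_compare_exchange_strong(&alloc_lock, &old, task) ||
		    old == task)
			return true;	/* acquired, or we already held it */
		return false;
	}

	int main(void)
	{
		int me, other;

		printf("%d\n", cannibalize_trylock(&me));	/* 1: acquired */
		printf("%d\n", cannibalize_trylock(&me));	/* 1: already ours */
		printf("%d\n", cannibalize_trylock(&other));	/* 0: held by someone else */
		return 0;
	}

The "|| old == task" half is what makes the lock effectively recursive for the task that already owns it, matching the "old == current" check in the patch.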
mutex_unlock(&bc->lock); six_unlock_write(&b->c.lock); @@ -736,33 +970,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); + if (path) { + u32 seq = six_lock_seq(&b->c.lock); - bch2_btree_node_read(trans, b, sync); + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); + bch2_trans_unlock_noassert(trans); - if (!sync) - return NULL; + bch2_btree_node_read(trans, b, sync); - if (path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); + int ret = bch2_trans_relock(trans); + if (ret) return ERR_PTR(ret); - } - } - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + if (!sync) + return NULL; + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -776,22 +1007,21 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) return; prt_printf(&buf, - "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + "btree node header doesn't match ptr: "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, "\nptr: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_printf(&buf, "\nheader: btree %s level %llu\n" - "min ", - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nheader: "); + bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nmin "); bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } @@ -814,7 +1044,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; bool need_relock = false; int ret; @@ -872,6 +1101,10 @@ retry: bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: @@ -901,7 +1134,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -934,7 +1167,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * { struct bch_fs *c = trans->c; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -992,7 +1224,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1011,7 +1243,6 @@ struct btree 
*bch2_btree_node_get_noiter(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1075,7 +1306,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } @@ -1094,18 +1325,22 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + if (b) + six_unlock_read(&b->c.lock); + return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1117,6 +1352,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ @@ -1128,6 +1365,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1139,10 +1378,10 @@ wait_on_io: BUG_ON(btree_node_dirty(b)); mutex_lock(&bc->lock); - btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); + btree_node_data_free(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -1152,13 +1391,39 @@ const char *bch2_btree_id_str(enum btree_id btree) return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; } +void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +{ + if (btree < BTREE_ID_NR) + prt_str(out, __bch2_btree_ids[btree]); + else + prt_printf(out, "(unknown btree %u)", btree); +} + +void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) +{ + prt_str(out, "btree="); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level=%u", level); +} + +void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + enum btree_id btree, unsigned level, struct bkey_s_c k) +{ + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level %u/", level); + struct btree_root *r = bch2_btree_id_root(c, btree); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + + bch2_bkey_val_to_text(out, c, k); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_id_str(b->c.btree_id), - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) @@ -1203,9 +1468,47 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, + const char *label, size_t nr) +{ + prt_printf(out, "%s\t", label); + prt_human_readable_u64(out, nr * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", nr); +} + +static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { +#define x(n) #n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + NULL +}; + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) { - prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); + prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { + bch2_btree_id_to_text(out, i); + prt_printf(out, "\t"); + prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); + } + + prt_newline(out); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); + prt_printf(out, "not freed:\n"); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) + prt_printf(out, " %s\t%llu\n", + bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6d33885fdbde..ca3c1b145330 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -12,11 +12,21 @@ struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); +void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); + +void 
__bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); + int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_node_pin(struct bch_fs *, struct btree *); +void bch2_btree_cache_unpin(struct bch_fs *); + +void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *); + void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); @@ -118,19 +128,29 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? r->b : NULL; } -const char *bch2_btree_id_str(enum btree_id); +const char *bch2_btree_id_str(enum btree_id); /* avoid */ +void bch2_btree_id_to_text(struct printbuf *, enum btree_id); +void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); + +void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, + enum btree_id, unsigned, struct bkey_s_c); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); +void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1102995643b1..dd1d9b74076e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -7,25 +7,29 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_methods.h" #include "bkey_buf.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" #include "journal.h" #include "keylist.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -40,6 +44,23 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +#define DID_FILL_FROM_SCAN 12 + +static const char * const bch2_gc_phase_strs[] = { +#define x(n) #n, + GC_PHASES() +#undef x + NULL +}; + +void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) +{ + prt_str(out, bch2_gc_phase_strs[p->phase]); + prt_char(out, ' '); + bch2_btree_id_level_to_text(out, p->btree, p->level); + prt_char(out, ' '); + bch2_bpos_to_text(out, p->pos); +} static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { @@ -49,12 +70,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) }}}; } -static bool should_restart_for_topology_repair(struct bch_fs 
*c) -{ - return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); -} - static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -66,94 +81,10 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); __gc_pos_set(c, new_pos); } -/* - * Missing: if an interior btree node is empty, we need to do something - - * perhaps just kill it - */ -static int bch2_gc_check_topology(struct bch_fs *c, - struct btree *b, - struct bkey_buf *prev, - struct bkey_buf cur, - bool is_last) -{ - struct bpos node_start = b->data->min_key; - struct bpos node_end = b->data->max_key; - struct bpos expected_start = bkey_deleted(&prev->k->k) - ? node_start - : bpos_successor(prev->k->k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - int ret = 0; - - if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - - if (!bpos_eq(expected_start, bp->v.min_key)) { - bch2_topology_error(c); - - if (bkey_deleted(&prev->k->k)) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, node_start); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); - } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - } - - if (is_last && !bpos_eq(cur.k->k.p, node_end)) { - bch2_topology_error(c); - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); - bch2_bpos_to_text(&buf2, node_end); - - if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - - bch2_bkey_buf_copy(prev, c, cur.k); -err: -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) { switch (b->key.k.type) { @@ -178,40 +109,22 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; + struct bkey_i_btree_ptr_v2 *new; int ret; - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = 
bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; - bch2_btree_node_hash_remove(&c->btree_cache, b); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); } - bch2_bkey_buf_exit(&tmp, c); -} - -static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -BCH_ERR_ENOMEM_gc_repair_key; @@ -237,6 +150,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); if (ret) return ret; @@ -259,7 +183,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) bch2_btree_node_drop_keys_outside_node(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new->k_i); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); @@ -268,128 +192,144 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, + struct btree *prev, struct btree *cur, + struct bpos *pulled_from_scan) { + struct bch_fs *c = trans->c; struct bpos expected_start = !prev ? 
b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - if (!prev) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, b->data->min_key); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { + prt_printf(&buf, "\n prev: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); - - if (prev && - bpos_gt(expected_start, cur->data->min_key) && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { - /* cur overwrites prev: */ - - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, - cur->data->min_key), c, - btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_PREV_NODE; - goto out; - } + prt_str(&buf, "\n next: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, - bpos_predecessor(cur->data->min_key)), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } else { - /* prev overwrites cur: */ - - if (mustfix_fsck_err_on(bpos_ge(expected_start, - cur->data->max_key), c, - btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_THIS_NODE; - goto out; - } + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, cur->data->min_key)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) + goto err; - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); + *pulled_from_scan = cur->data->min_key; + ret = DID_FILL_FROM_SCAN; + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } else { /* overlap */ + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? 
*/ + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) + ret = DROP_PREV_NODE; + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } + } else { + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) + ret = DROP_THIS_NODE; + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } } -out: +err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int btree_repair_node_end(struct bch_fs *c, struct btree *b, - struct btree *child) +static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, + struct btree *child, struct bpos *pulled_from_scan) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); - bch2_bpos_to_text(&buf2, b->key.k.p); - - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = set_node_max(c, child, b->key.k.p); - if (ret) - goto err; + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; + + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); + + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) { + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, b->key.k.p)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) + goto err; + + *pulled_from_scan = b->key.k.p; + ret = DID_FILL_FROM_SCAN; + } else { + ret = set_node_max(c, child, b->key.k.p); + } } err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, + struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; - bool have_child, dropped_children = false; + bool have_child, new_pass = false; struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) return 0; -again: - prev = NULL; - have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); +again: + cur = prev = NULL; + have_child = new_pass = false; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -404,19 +344,23 @@ again: ret = PTR_ERR_OR_ZERO(cur); 
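To summarize the boundary-repair logic above: btree_check_node_boundaries() now distinguishes a gap between siblings, which on level-1 parents can be filled from nodes found by btree_node_scan (the pulled_from_scan cursor only moves forward, so a given range is pulled from the scan results at most once, and DID_FILL_FROM_SCAN triggers another pass over the parent), from an overlap, where the node with the newer sequence number wins and the older one is trimmed or dropped. A condensed, standalone restatement of that decision, with positions reduced to plain integers and the level/scan conditions folded into a single flag; illustrative only, none of this is the real bpos/bkey code:

	#include <stdbool.h>
	#include <stdio.h>

	enum action { OK, FILL_FROM_SCAN, SET_CUR_MIN, SET_PREV_MAX, DROP_PREV, DROP_CUR };

	static const char *action_str[] = {
		"ok", "fill_from_scan", "set_cur_min", "set_prev_max", "drop_prev", "drop_cur",
	};

	static enum action check_boundary(bool have_prev,
					  unsigned long parent_min,
					  unsigned long prev_min, unsigned long prev_max,
					  unsigned long prev_seq,
					  unsigned long cur_min, unsigned long cur_max,
					  unsigned long cur_seq,
					  bool can_fill_from_scan)
	{
		unsigned long expected_start = have_prev ? prev_max + 1 : parent_min;

		if (expected_start == cur_min)
			return OK;

		if (expected_start < cur_min)		/* gap between siblings */
			return can_fill_from_scan ? FILL_FROM_SCAN : SET_CUR_MIN;

		/* overlap: the newer node wins, the older one is trimmed or dropped */
		if (have_prev && cur_seq > prev_seq)
			return prev_min >= cur_min ? DROP_PREV : SET_PREV_MAX;
		else
			return expected_start >= cur_max ? DROP_CUR : SET_CUR_MIN;
	}

	int main(void)
	{
		/* gap: prev covers ..10, cur starts at 20, no scanned nodes available */
		printf("%s\n", action_str[check_boundary(true, 0, 0, 10, 1, 20, 30, 2, false)]);
		/* overlap: cur is newer and fully covers prev */
		printf("%s\n", action_str[check_boundary(true, 0, 5, 10, 1, 5, 30, 2, false)]);
		return 0;
	}

The first case prints "set_cur_min" (repair cur's min_key up to the expected start) and the second prints "drop_prev", matching the set_node_min()/DROP_PREV_NODE outcomes in the hunk above.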
printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); + prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, - btree_node_unreadable, - "Topology repair: unreadable btree node at btree %s level %u:\n" + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + trans, btree_node_read_error, + "Topology repair: unreadable btree node at\n" " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); + cur = NULL; ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - cur = NULL; + if (ret) + break; + + ret = bch2_btree_lost_data(c, b->c.btree_id); if (ret) break; continue; @@ -426,7 +370,23 @@ again: if (ret) break; - ret = btree_repair_node_boundaries(c, b, prev, cur); + if (bch2_btree_node_is_stale(c, cur)) { + bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); @@ -444,6 +404,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { + bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); @@ -451,8 +412,6 @@ again: break; bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); goto again; } else if (ret) break; @@ -464,7 +423,11 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev); + ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } } if (!IS_ERR_OR_NULL(prev)) @@ -478,7 +441,12 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + if (new_pass) + goto again; + + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -493,7 +461,7 @@ again: if (ret) goto err; - ret = bch2_btree_repair_topology_recurse(trans, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); six_unlock_read(&cur->c.lock); cur = NULL; @@ -501,7 +469,7 @@ again: bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - dropped_children = true; + new_pass = true; } if (ret) @@ -511,14 +479,13 @@ again: } printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - if (mustfix_fsck_err_on(!have_child, c, - btree_node_topology_interior_node_empty, - "empty interior btree node at btree %s level %u\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level, buf.buf)) + if (mustfix_fsck_err_on(!have_child, + trans, btree_node_topology_interior_node_empty, + "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -528,12 +495,14 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - 
bch2_bkey_buf_exit(&cur_k, c); - if (!ret && dropped_children) + if (!ret && new_pass) goto again; + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); return ret; } @@ -541,550 +510,215 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree *b; - unsigned i; + struct bpos pulled_from_scan = POS_MIN; + struct printbuf buf = PRINTBUF; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->alive) - continue; + bch2_trans_srcu_unlock(trans); - b = r->b; - if (btree_node_fake(b)) - continue; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b); - six_unlock_read(&b->c.lock); - - if (ret == DROP_THIS_NODE) { - bch_err(c, "empty btree root - repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - } - } - - bch2_trans_put(trans); - - return ret; -} - -static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; - /* - * XXX - * use check_bucket_ref here - */ - bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); - - if (!g->gen_valid && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } - } + printbuf_reset(&buf); + bch2_btree_id_to_text(&buf, i); - if (gen_cmp(p.ptr.gen, g->gen) > 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } + if (r->error) { + ret = bch2_btree_lost_data(c, i); + if (ret) + break; +reconstruct_root: + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - 
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) - do_update = true; - - if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) - do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - continue; + r->alive = false; + r->error = 0; - if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != data_type, c, - ptr_bucket_data_type_mismatch, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->data_type = data_type; - set_bit(BCH_FS_need_another_gc, &c->flags); + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { - do_update = true; + bch2_btree_root_alloc_fake_trans(trans, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + break; } - } - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; + reconstructed_root = true; } - } - if (do_update) { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct bkey_i *new; + struct btree *b = r->b; - if (is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - - new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!new) { - ret = -BCH_ERR_ENOMEM_gc_repair_key; - bch_err_msg(c, ret, "allocating new key"); - goto err; - } - - bkey_reassemble(new, *k); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - } else { - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->gen) < 0) || - gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type); - })); -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, 
entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); + six_unlock_read(&b->c.lock); - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } + if (ret == DROP_THIS_NODE) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); - if (level) - bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + r->b = NULL; - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, *k); - bch_info(c, "updated %s", buf.buf); + if (!reconstructed_root) + goto reconstruct_root; - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); + bch_err(c, "empty btree root %s", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 0); + r->alive = false; + ret = 0; } - - *k = bkey_i_to_s_c(new); } -err: fsck_err: printbuf_exit(&buf); + bch2_trans_put(trans); return ret; } /* marking of btree keys/nodes: */ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k, + unsigned level, struct btree **prev, + struct btree_iter *iter, struct bkey_s_c k, bool initial) { struct bch_fs *c = trans->c; - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - int ret = 0; - - deleted.p = k->k->p; - if (initial) { - BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > atomic64_read(&c->journal.seq)); - - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; + if (iter) { + struct btree_path *path = btree_iter_path(trans, iter); + struct btree *b = path_l(path)->b; - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, - bkey_version_in_future, - "key version number higher than recorded: %llu > %llu", - k->k->version.lo, - atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k->k->version.lo); - } - - ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); -fsck_err: -err: - bch_err_fn(c, ret); - return ret; -} - -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) -{ - struct bch_fs *c = trans->c; - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; - struct bkey_buf prev, cur; - int ret = 0; - - if (!btree_node_type_needs_gc(btree_node_type(b))) - return 0; - - bch2_btree_node_iter_init_from_start(&iter, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); - - while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, initial); - if (ret) - break; - - bch2_btree_node_iter_advance(&iter, b); - 
- if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_gc_check_topology(c, b, &prev, cur, - bch2_btree_node_iter_end(&iter)); + if (*prev != b) { + int ret = bch2_btree_node_check_topology(trans, b); if (ret) - break; + return ret; } + *prev = b; } - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret; -} - -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, - bool initial, bool metadata_only) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - unsigned depth = metadata_only ? 1 : 0; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - - __for_each_btree_node(trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { - bch2_verify_btree_nr_keys(b); + deleted.p = k.k->p; - gc_pos_set(c, gc_pos_btree_node(b)); - - ret = btree_gc_mark_node(trans, b, initial); - if (ret) - break; + if (initial) { + BUG_ON(bch2_journal_seq_verify && + k.k->bversion.lo > atomic64_read(&c->journal.seq)); + + if (fsck_err_on(btree_id != BTREE_ID_accounting && + k.k->bversion.lo > atomic64_read(&c->key_version), + trans, bkey_version_in_future, + "key version number higher than recorded %llu\n %s", + atomic64_read(&c->key_version), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + atomic64_set(&c->key_version, k.k->bversion.lo); } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - mutex_lock(&c->btree_root_lock); - b = bch2_btree_id_root(c, btree_id)->b; - if (!btree_node_fake(b)) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - true, &k, initial); + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), + trans, btree_bitmap_not_marked, + "btree ptr not marked in member info btree allocated bitmap\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + mutex_lock(&c->sb_lock); + bch2_dev_btree_bitmap_mark(c, k); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); } - gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); - mutex_unlock(&c->btree_root_lock); - - return ret; -} -static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, - unsigned target_depth) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf cur, prev; - struct printbuf buf = PRINTBUF; - int ret = 0; - - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, - false, &k, true); - if (ret) - goto fsck_err; - - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - k = bkey_i_to_s_c(cur.k); + /* + * We require a commit before key_trigger() because + * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the + * wrong result if we run it multiple times. + */ + unsigned flags = !iter ? 
BTREE_TRIGGER_is_root : 0; - bch2_btree_and_journal_iter_advance(&iter); + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_check_repair|flags); + if (ret) + goto out; - ret = bch2_gc_check_topology(c, b, - &prev, cur, - !bch2_btree_and_journal_iter_peek(&iter).k); - if (ret) - goto fsck_err; - } else { - bch2_btree_and_journal_iter_advance(&iter); - } + if (trans->nr_updates) { + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } - if (b->c.level > target_depth) { - bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - struct btree *child; - - bch2_bkey_buf_reassemble(&cur, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - child = bch2_btree_node_get_noiter(trans, cur.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(child); - - if (ret == -EIO) { - bch2_topology_error(c); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_read_error, - "Unreadable btree node at btree %s level %u:\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto fsck_err; - } else { - /* Continue marking when opted to not - * fix the error: */ - ret = 0; - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - continue; - } - } else if (ret) { - bch_err_msg(c, ret, "getting btree node"); - break; - } - - ret = bch2_gc_btree_init_recurse(trans, child, - target_depth); - six_unlock_read(&child->c.lock); - - if (ret) - break; - } - } + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); +out: fsck_err: - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - bch2_btree_and_journal_iter_exit(&iter); printbuf_exit(&buf); + bch_err_fn(c, ret); return ret; } -static int bch2_gc_btree_init(struct btree_trans *trans, - enum btree_id btree_id, - bool metadata_only) +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - struct btree *b; - unsigned target_depth = metadata_only ? 1 : 0; - struct printbuf buf = PRINTBUF; + unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 
0 : 1; int ret = 0; - b = bch2_btree_id_root(c, btree_id)->b; + /* We need to make sure every leaf node is readable before going RW */ + if (initial) + target_depth = 0; - if (btree_node_fake(b)) - return 0; + for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { + struct btree *prev = NULL; + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch); - six_lock_read(&b->c.lock, NULL, NULL); - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, - btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); + if (ret) + goto err; } - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, - btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } + /* root */ + do { +retry_root: + bch2_trans_begin(trans); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, + 0, bch2_btree_id_root(c, btree)->b->c.level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err_root; - if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(trans, b, target_depth); + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } - if (!ret) { + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, - &k, true); - } -fsck_err: - six_unlock_read(&b->c.lock); - + ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); +err_root: + bch2_trans_iter_exit(trans, &iter); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); +err: bch_err_fn(c, ret); - printbuf_exit(&buf); return ret; } static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + return cmp_int(gc_btree_order(l), gc_btree_order(r)); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; + struct printbuf buf = PRINTBUF; unsigned i; int ret = 0; @@ -1092,259 +726,52 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < BTREE_ID_NR && !ret; i++) - ret = initial - ? bch2_gc_btree_init(trans, ids[i], metadata_only) - : bch2_gc_btree(trans, ids[i], initial, metadata_only); + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + unsigned btree = i < BTREE_ID_NR ? ids[i] : i; - for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { - if (!bch2_btree_id_root(c, i)->alive) + if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = initial - ? 
bch2_gc_btree_init(trans, i, metadata_only) - : bch2_gc_btree(trans, i, initial, metadata_only); + ret = bch2_gc_btree(trans, btree, true); } + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } -static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - unsigned flags) -{ - u64 b = sector_to_bucket(ca, start); - - do { - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - bch2_mark_metadata_bucket(c, ca, b, type, sectors, - gc_phase(GC_PHASE_SB), flags); - b++; - start += sectors; - } while (start < end); -} - -static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) +static int bch2_mark_superblocks(struct bch_fs *c) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - unsigned i; - u64 b; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset == BCH_SB_SECTOR) - mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, flags); - - mark_metadata_sectors(c, ca, offset, - offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, flags); - } + gc_pos_set(c, gc_phase(GC_PHASE_sb)); - for (i = 0; i < ca->journal.nr; i++) { - b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), flags); - } + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); } -static void bch2_mark_superblocks(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB)); - - for_each_online_member(c, ca) - bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); - mutex_unlock(&c->sb_lock); -} - -#if 0 -/* Also see bch2_pending_btree_node_free_insert_done() */ -static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -{ - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); - - for_each_pending_btree_node_free(c, as, d) - if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); - - mutex_unlock(&c->btree_interior_update_lock); -} -#endif - static void bch2_gc_free(struct bch_fs *c) { + bch2_accounting_gc_free(c); + genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); - ca->buckets_gc = NULL; - - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; -} - -static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) -{ - struct bch_dev *ca = NULL; - struct printbuf buf = PRINTBUF; - bool verify = !metadata_only && - !c->opts.reconstruct_alloc && - (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); - unsigned i; - int ret = 0; - - percpu_down_write(&c->mark_lock); - -#define copy_field(_err, _f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, _err, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f))) \ - dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) 
\ - copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - __for_each_member_device(c, ca) { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, - dev_usage_u64s()); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_type_str(i)); - copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_type_str(i)); - copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); - } - } - - { - unsigned nr = fs_usage_u64s(c); - struct bch_fs_usage *dst = c->usage_base; - struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); - - copy_fs_field(fs_usage_hidden_wrong, - b.hidden, "hidden"); - copy_fs_field(fs_usage_btree_wrong, - b.btree, "btree"); - - if (!metadata_only) { - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, e); - - copy_fs_field(fs_usage_replicas_wrong, - replicas[i], "%s", buf.buf); - } - } - -#undef copy_fs_field -#undef copy_dev_field -#undef copy_stripe_field -#undef copy_field -fsck_err: - if (ca) - percpu_ref_put(&ca->ref); - bch_err_fn(c, ret); - - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - return ret; + for_each_member_device(c, ca) + genradix_free(&ca->buckets_gc); } static int bch2_gc_start(struct bch_fs *c) { - BUG_ON(c->usage_gc); - - c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!c->usage_gc) { - bch_err(c, "error allocating c->usage_gc"); - return -BCH_ERR_ENOMEM_gc_start; - } - for_each_member_device(c, ca) { - BUG_ON(ca->usage_gc); - - ca->usage_gc = alloc_percpu(struct bch_dev_usage); - if (!ca->usage_gc) { - bch_err(c, "error allocating ca->usage_gc"); - percpu_ref_put(&ca->ref); - return -BCH_ERR_ENOMEM_gc_start; + int ret = bch2_dev_usage_init(ca, true); + if (ret) { + bch2_dev_put(ca); + return ret; } - - this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, - ca->mi.nbuckets - ca->mi.first_bucket); } return 0; } -static int bch2_gc_reset(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; - - return bch2_gc_start(c); -} - /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1353,6 +780,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, l.oldest_gen != r.oldest_gen || l.data_type != r.data_type || l.dirty_sectors != r.dirty_sectors || + l.stripe_sectors != r.stripe_sectors || l.cached_sectors != r.cached_sectors || l.stripe_redundancy != r.stripe_redundancy || l.stripe != r.stripe; @@ -1360,59 +788,55 @@ static inline bool 
bch2_alloc_v4_cmp(struct bch_alloc_v4 l, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - bool metadata_only) + struct bch_dev *ca, + struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_convert, new; + struct bch_alloc_v4 old_gc, gc, old_convert, new; const struct bch_alloc_v4 *old; - enum bch_data_type type; int ret; + if (!bucket_valid(ca, k.k->p.offset)) + return 0; + old = bch2_alloc_to_v4(k, &old_convert); - new = *old; + gc = new = *old; + + __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); - percpu_down_read(&c->mark_lock); - b = gc_bucket(ca, iter->pos.offset); + old_gc = gc; + + if ((old->data_type == BCH_DATA_sb || + old->data_type == BCH_DATA_journal) && + !bch2_dev_is_online(ca)) { + gc.data_type = old->data_type; + gc.dirty_sectors = old->dirty_sectors; + } /* - * b->data_type doesn't yet include need_discard & need_gc_gen states - + * gc.data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - if (b->data_type != type) { - struct bch_dev_usage *u; - - preempt_disable(); - u = this_cpu_ptr(ca->usage_gc); - u->d[b->data_type].buckets--; - b->data_type = type; - u->d[b->data_type].buckets++; - preempt_enable(); - } - - gc = *b; - percpu_up_read(&c->mark_lock); - - if (metadata_only && - gc.data_type != BCH_DATA_sb && - gc.data_type != BCH_DATA_journal && - gc.data_type != BCH_DATA_btree) - return 0; + alloc_data_type_set(&gc, gc.data_type); + if (gc.data_type != old_gc.data_type || + gc.dirty_sectors != old_gc.dirty_sectors) { + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); + if (ret) + return ret; - if (gen_after(old->gen, gc.gen)) - return 0; + /* + * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not + * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so + * we don't run it twice: + */ + struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); + gc_m->data_type = gc.data_type; + gc_m->dirty_sectors = gc.dirty_sectors; + } - if (c->opts.reconstruct_alloc || - fsck_err_on(new.data_type != gc.data_type, c, - alloc_key_data_type_wrong, + if (fsck_err_on(new.data_type != gc.data_type, + trans, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", iter->pos.inode, iter->pos.offset, @@ -1422,26 +846,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, \ + trans, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ + ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ bch2_data_type_str(gc.data_type), \ - new._f, gc._f)) \ + (u64) new._f, (u64) gc._f)) \ new._f = gc._f; \ - copy_bucket_field(alloc_key_gen_wrong, - gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, - dirty_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, - cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, - stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, - stripe_redundancy); + copy_bucket_field(alloc_key_gen_wrong, gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -1455,31 +875,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; /* - * The trigger normally makes sure this is set, but we're not running + * The trigger normally makes sure these are set, but we're not running * triggers: */ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c) { int ret = 0; for_each_member_device(c, ca) { ret = bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - bch2_alloc_write_key(trans, &iter, k, metadata_only))); + BTREE_ITER_slots|BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -1488,179 +908,23 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) return ret; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c) { - for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if 
(!buckets) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; - } - - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = ca->mi.nbuckets; - rcu_assign_pointer(ca->buckets_gc, buckets); - } - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - struct bucket *g = gc_bucket(ca, k.k->p.offset); - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - g->gen_valid = 1; - g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } - - 0; - }))); - bch_err_fn(c, ret); - return ret; -} + int ret = 0; -static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) -{ for_each_member_device(c, ca) { - struct bucket_array *buckets = gc_bucket_array(ca); - struct bucket *g; - - for_each_bucket(g, buckets) { - if (metadata_only && - (g->data_type == BCH_DATA_user || - g->data_type == BCH_DATA_cached || - g->data_type == BCH_DATA_parity)) - continue; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; + ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); + if (ret) { + bch2_dev_put(ca); + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + break; } } -} - -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) -{ - size_t idx = 0; - - if (metadata_only) - return 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -static int bch2_gc_reflink_start(struct bch_fs *c, - bool metadata_only) -{ - - if (metadata_only) - return 0; - - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - 
c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); bch_err_fn(c, ret); return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) -{ - struct genradix_iter iter; - struct reflink_gc *r; - - genradix_for_each(&c->reflink_gc_table, iter, r) - r->refcount = 0; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1693,7 +957,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, if (bad) bch2_bkey_val_to_text(&buf, c, k); - if (fsck_err_on(bad, c, stripe_sector_count_wrong, + if (fsck_err_on(bad, + trans, stripe_sector_count_wrong, "%s", buf.buf)) { struct bkey_i_stripe *new; @@ -1714,30 +979,20 @@ fsck_err: return ret; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c) { - if (metadata_only) - return 0; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_stripes_key(trans, &iter, k))); } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) -{ - genradix_free(&c->gc_stripes); -} - /** - * bch2_gc - walk _all_ references to buckets, and recompute them: + * bch2_check_allocations - walk all references to buckets, and recompute them: * * @c: filesystem object - * @initial: are we in recovery? - * @metadata_only: are we just checking metadata references, or everything? * * Returns: 0 on success, or standard errcode on failure * @@ -1756,9 +1011,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +int bch2_check_allocations(struct bch_fs *c) { - unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1767,67 +1021,34 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c) ?: - bch2_gc_alloc_start(c, metadata_only) ?: - bch2_gc_reflink_start(c, metadata_only); + ret = bch2_gc_accounting_start(c) ?: + bch2_gc_start(c) ?: + bch2_gc_alloc_start(c) ?: + bch2_gc_reflink_start(c); if (ret) goto out; -again: - gc_pos_set(c, gc_phase(GC_PHASE_START)); - bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_start)); - ret = bch2_gc_btrees(c, initial, metadata_only); + ret = bch2_mark_superblocks(c); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto out; + ret = bch2_gc_btrees(c); if (ret) goto out; -#if 0 - bch2_mark_pending_btree_node_frees(c); -#endif c->gc_count++; - if (test_bit(BCH_FS_need_another_gc, &c->flags) || - (!iter && bch2_test_restart_gc)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto out; - } - - /* - * XXX: make sure gens we fixed got saved - */ - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_need_another_gc, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - - bch2_gc_stripes_reset(c, metadata_only); - bch2_gc_alloc_reset(c, metadata_only); - bch2_gc_reflink_reset(c, metadata_only); - ret = bch2_gc_reset(c); - if (ret) - goto out; - - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - goto again; - } + ret = 
bch2_gc_alloc_done(c) ?: + bch2_gc_accounting_done(c) ?: + bch2_gc_stripes_done(c) ?: + bch2_gc_reflink_done(c); out: - if (!ret) { - bch2_journal_block(&c->journal); - - ret = bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only) ?: - bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only); - - bch2_journal_unblock(&c->journal); - } - percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); bch2_gc_free(c); percpu_up_write(&c->mark_lock); @@ -1852,24 +1073,31 @@ static int gc_btree_gens_key(struct btree_trans *trans, struct bkey_i *u; int ret; - percpu_down_read(&c->mark_lock); + if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) + return -EROFS; + + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (ptr_stale(ca, ptr) > 16) { - percpu_up_read(&c->mark_lock); + if (dev_ptr_stale(ca, ptr) > 16) { + rcu_read_unlock(); goto update; } } bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; if (gen_after(*gen, ptr->gen)) *gen = ptr->gen; } - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); return 0; update: u = bch2_bkey_make_mut(trans, iter, &k, 0); @@ -1881,10 +1109,9 @@ update: return 0; } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, + struct btree_iter *iter, struct bkey_s_c k) { - struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; @@ -1899,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } @@ -1909,16 +1135,20 @@ int bch2_gc_gens(struct bch_fs *c) u64 b, start_time = local_clock(); int ret; - /* - * Ideally we would be using state_lock and not gc_lock here, but that - * introduces a deadlock in the RO path - we currently take the state - * lock at the start of going RO, thus the gc thread may get stuck: - */ if (!mutex_trylock(&c->gc_gens_lock)) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->gc_lock); + + /* + * We have to use trylock here. Otherwise, we would + * introduce a deadlock in the RO path - we take the + * state lock at the start of going RO. 
+ */ + if (!down_read_trylock(&c->state_lock)) { + mutex_unlock(&c->gc_gens_lock); + return 0; + } for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); @@ -1927,7 +1157,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } @@ -1945,7 +1175,7 @@ int bch2_gc_gens(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -1954,14 +1184,23 @@ int bch2_gc_gens(struct bch_fs *c) goto err; } + struct bch_dev *ca = NULL; ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, + BTREE_ITER_prefetch, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_oldest_gen(trans, &iter, k))); + BCH_TRANS_COMMIT_no_enospc, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + bch2_alloc_write_oldest_gen(trans, ca, &iter, k); + }))); + bch2_dev_put(ca); + if (ret) goto err; @@ -1978,94 +1217,37 @@ err: ca->oldest_gen = NULL; } - up_read(&c->gc_lock); + up_read(&c->state_lock); mutex_unlock(&c->gc_gens_lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); return ret; } -static int bch2_gc_thread(void *arg) +static void bch2_gc_gens_work(struct work_struct *work) { - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic64_read(&clock->now); - unsigned last_kick = atomic_read(&c->kick_gc); - - set_freezable(); - - while (1) { - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - if (atomic_read(&c->kick_gc) != last_kick) - break; - - if (c->btree_gc_periodic) { - unsigned long next = last + c->capacity / 16; - - if (atomic64_read(&clock->now) >= next) - break; - - bch2_io_clock_schedule_timeout(clock, next); - } else { - schedule(); - } - - try_to_freeze(); - } - __set_current_state(TASK_RUNNING); - - last = atomic64_read(&clock->now); - last_kick = atomic_read(&c->kick_gc); - - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - ret = bch2_gc(c, false, false); -#else - bch2_gc_gens(c); -#endif - debug_check_no_locks_held(); - } - - return 0; + struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); + bch2_gc_gens(c); + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_gc_thread_stop(struct bch_fs *c) +void bch2_gc_gens_async(struct bch_fs *c) { - struct task_struct *p; - - p = c->gc_thread; - c->gc_thread = NULL; - - if (p) { - kthread_stop(p); - put_task_struct(p); - } + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + !queue_work(c->write_ref_wq, &c->gc_gens_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -int bch2_gc_thread_start(struct bch_fs *c) +void bch2_fs_btree_gc_exit(struct bch_fs *c) { - struct task_struct *p; - - if (c->gc_thread) - return 0; +} - p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) { - bch_err_fn(c, PTR_ERR(p)); - return PTR_ERR(p); - } +int bch2_fs_btree_gc_init(struct bch_fs *c) +{ + seqcount_init(&c->gc_pos_lock); + INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); - get_task_struct(p); - c->gc_thread = p; - wake_up_process(p); + init_rwsem(&c->gc_lock); 
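/*
 * Aside on the oldest_gen tracking in bch2_gc_gens() above: bucket generation
 * numbers are one byte per bucket and wrap, and the walk lowers the recorded
 * value whenever it is newer than a pointer's gen, so what survives is the
 * oldest gen still referenced - which bch2_alloc_write_oldest_gen() then
 * writes back into the alloc key.  A minimal standalone sketch of that
 * comparison, in the spirit of gen_after(); the sk_* names are stand-ins,
 * not the bcachefs definitions:
 */
#include <stdint.h>

/* wraparound-safe "a is newer than b" via signed 8-bit difference: */
static inline int sk_gen_after(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

/*
 * Note that a pointer with generation ptr_gen references this bucket; keep
 * oldest_gen as the oldest generation seen so far:
 */
static inline void sk_note_ptr_gen(uint8_t *oldest_gen, uint8_t ptr_gen)
{
	if (sk_gen_after(*oldest_gen, ptr_gen))
		*oldest_gen = ptr_gen;
}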
+ mutex_init(&c->gc_gens_lock); return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 607575f83a00..9693a90a48a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -3,13 +3,11 @@ #define _BCACHEFS_BTREE_GC_H #include "bkey.h" +#include "btree_gc_types.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); -int bch2_gc(struct bch_fs *, bool, bool); -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_thread_stop(struct bch_fs *); -int bch2_gc_thread_start(struct bch_fs *); +int bch2_check_allocations(struct bch_fs *); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -35,60 +33,36 @@ int bch2_gc_thread_start(struct bch_fs *); /* Position of (the start of) a gc phase: */ static inline struct gc_pos gc_phase(enum gc_phase phase) { - return (struct gc_pos) { - .phase = phase, - .pos = POS_MIN, - .level = 0, - }; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - bpos_cmp(l.pos, r.pos) ?: - cmp_int(l.level, r.level); -} - -static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -{ - switch (id) { -#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; - BCH_BTREE_IDS() -#undef x - default: - BUG(); - } + return (struct gc_pos) { .phase = phase, }; } -static inline struct gc_pos gc_pos_btree(enum btree_id id, - struct bpos pos, unsigned level) +static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, + struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(id), - .pos = pos, + .phase = GC_PHASE_btree, + .btree = btree, .level = level, + .pos = pos, }; } -/* - * GC position of the pointers within a btree node: note, _not_ for &b->key - * itself, that lives in the parent node: - */ -static inline struct gc_pos gc_pos_btree_node(struct btree *b) +static inline int gc_btree_order(enum btree_id btree) { - return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); + if (btree == BTREE_ID_alloc) + return -2; + if (btree == BTREE_ID_stripes) + return -1; + return btree; } -/* - * GC position of the pointer to a btree root: we don't use - * gc_pos_pointer_to_btree_node() here to avoid a potential race with - * btree_split() increasing the tree depth - the new root will have level > the - * old root and thus have a greater gc position than the old root, but that - * would be incorrect since once gc has marked the root it's not coming back. 
- */ -static inline struct gc_pos gc_pos_btree_root(enum btree_id id) +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) @@ -104,11 +78,12 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } -static inline void bch2_do_gc_gens(struct bch_fs *c) -{ - atomic_inc(&c->kick_gc); - if (c->gc_thread) - wake_up_process(c->gc_thread); -} +void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); + +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_gens_async(struct bch_fs *); + +void bch2_fs_btree_gc_exit(struct bch_fs *); +int bch2_fs_btree_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h new file mode 100644 index 000000000000..c24dd6edf377 --- /dev/null +++ b/fs/bcachefs/btree_gc_types.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_TYPES_H +#define _BCACHEFS_BTREE_GC_TYPES_H + +#include <linux/generic-radix-tree.h> + +#define GC_PHASES() \ + x(not_running) \ + x(start) \ + x(sb) \ + x(btree) + +enum gc_phase { +#define x(n) GC_PHASE_##n, + GC_PHASES() +#undef x +}; + +struct gc_pos { + enum gc_phase phase:8; + enum btree_id btree:8; + u16 level; + struct bpos pos; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index aa9b6cbe3226..756736f9243d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -23,6 +23,17 @@ #include <linux/sched/mm.h> +static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) +{ + bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); + prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); + prt_str(out, "min: "); + bch2_bpos_to_text(out, bn->min_key); + prt_newline(out); + prt_str(out, "max: "); + bch2_bpos_to_text(out, bn->max_key); +} + void bch2_btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); @@ -34,8 +45,6 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -54,16 +63,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); } void bch2_btree_node_wait_on_write(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -103,7 +108,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +120,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { 
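/*
 * Aside on the GC position ordering introduced in the btree_gc.h and
 * btree_gc_types.h hunks above: gc_pos now carries (phase, btree, level, pos),
 * and gc_btree_order() makes the alloc and stripes btrees compare before every
 * other btree; within a btree, positions order by level, then key position.
 * Below is a minimal self-contained restatement of that comparison.  The sk_*
 * types and enum values are illustrative stand-ins, not the bcachefs
 * definitions, and the GNU "a ?: b" chaining mirrors the cmp_int()/bpos_cmp()
 * style used in the real code.
 */
#include <stdint.h>

enum sk_btree_id { SK_BTREE_extents, SK_BTREE_alloc, SK_BTREE_stripes };	/* ids illustrative only */
enum sk_gc_phase { SK_GC_not_running, SK_GC_start, SK_GC_sb, SK_GC_btree };

struct sk_bpos   { uint64_t inode, offset; };
struct sk_gc_pos {
	enum sk_gc_phase	phase;
	enum sk_btree_id	btree;
	unsigned		level;
	struct sk_bpos		pos;
};

static int sk_cmp_s64(int64_t  l, int64_t  r) { return (l > r) - (l < r); }
static int sk_cmp_u64(uint64_t l, uint64_t r) { return (l > r) - (l < r); }

static int sk_bpos_cmp(struct sk_bpos l, struct sk_bpos r)
{
	return sk_cmp_u64(l.inode, r.inode) ?: sk_cmp_u64(l.offset, r.offset);
}

/* alloc and stripes sort before all other btrees in the GC ordering: */
static int sk_gc_btree_order(enum sk_btree_id btree)
{
	if (btree == SK_BTREE_alloc)
		return -2;
	if (btree == SK_BTREE_stripes)
		return -1;
	return btree;
}

static int sk_gc_pos_cmp(struct sk_gc_pos l, struct sk_gc_pos r)
{
	return sk_cmp_s64(l.phase, r.phase) ?:
	       sk_cmp_s64(sk_gc_btree_order(l.btree), sk_gc_btree_order(r.btree)) ?:
	       sk_cmp_s64(l.level, r.level) ?:
	       sk_bpos_cmp(l.pos, r.pos);
}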
*used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -217,7 +222,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { - struct bset_tree *t; bool ret = false; for_each_bset(b, t) { @@ -288,8 +292,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, static void btree_node_sort(struct bch_fs *c, struct btree *b, unsigned start_idx, - unsigned end_idx, - bool filter_whiteouts) + unsigned end_idx) { struct btree_node *out; struct sort_iter_stack sort_iter; @@ -320,7 +323,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); out->keys.u64s = cpu_to_le16(u64s); @@ -426,13 +429,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, - b->nsets, false); + btree_node_sort(c, b, unwritten_idx, b->nsets); ret = true; } if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx); ret = true; } @@ -441,8 +443,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) void bch2_btree_build_aux_trees(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, !bset_written(b, bset(b, t)) && @@ -489,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) if (b->nsets == MAX_BSETS && !btree_node_write_in_flight(b) && should_compact_all(c, b)) { - bch2_btree_node_write(c, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); + bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); reinit_iter = true; } @@ -512,7 +512,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, + struct btree *b, struct bset *i, struct bkey_packed *k, unsigned offset, int write) { prt_printf(out, bch2_log_msg(c, "%s"), @@ -524,28 +524,36 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u/%u", - b->written, btree_ptr_sectors_written(&b->key)); + printbuf_indent_add(out, 2); + + prt_printf(out, "\nnode offset %u/%u", + b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + if (k) + prt_printf(out, " bset byte offset %lu", + (unsigned long)(void *)k - + ((unsigned long)(void *)i & ~511UL)); prt_str(out, ": "); } -__printf(9, 10) +__printf(10, 11) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, + struct bkey_packed *k, int write, bool have_retry, enum bch_sb_error_id err_type, const char *fmt, ...) 
{ struct printbuf out = PRINTBUF; + bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; - btree_err_msg(&out, c, ca, b, i, b->written, write); + btree_err_msg(&out, c, ca, b, i, k, b->written, write); va_start(args, fmt); prt_vprintf(&out, fmt, args); @@ -564,12 +572,14 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (ret != -BCH_ERR_btree_node_read_err_fixable) + if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) bch2_sb_error_count(c, err_type); switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + ret = !silent + ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) + : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) goto fsck_err; @@ -577,15 +587,17 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_want_retry: case -BCH_ERR_btree_node_read_err_must_retry: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); break; case -BCH_ERR_btree_node_read_err_bad_node: - bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; default: @@ -597,9 +609,9 @@ fsck_err: return ret; } -#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) 
\ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ BCH_FSCK_ERR_##_err_type, \ msg, ##__VA_ARGS__); \ \ @@ -620,8 +632,6 @@ fsck_err: __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k; @@ -655,6 +665,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) */ bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + b->nr = bch2_btree_node_count_keys(b); struct bkey_s_c k; struct bkey unpacked; @@ -671,13 +682,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on(!bch2_version_compatible(version), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), @@ -685,7 +697,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { @@ -698,7 +710,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -710,29 +722,28 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > btree_sectors(c), + if (!write && + btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_past_end_of_btree_node, - "bset past end of btree node")) { + "bset past end of btree node (offset %u len %u but written %zu)", + offset, sectors, ptr_written ?: btree_sectors(c))) i->u64s = 0; - ret = 0; - goto out; - } btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_empty, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_wrong_sector_offset, "bset at wrong sector offset"); @@ -748,20 +759,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_level, "incorrect level"); @@ -780,7 
+791,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -791,7 +802,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), @@ -803,7 +814,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), @@ -814,24 +825,41 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } -out: fsck_err: printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags + }); +} + +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, + enum bch_validate_flags flags) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags, + }; + return __bch2_bkey_validate(c, k, from) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: + (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); } -static bool __bkey_valid(struct bch_fs *c, struct btree *b, +static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bset *i, struct bkey_packed *k) { if (bkey_p_next(k) > vstruct_last(i)) @@ -840,12 +868,26 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; - struct printbuf buf = PRINTBUF; + if (!bkeyp_u64s_valid(&b->format, k)) + return false; + struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_silent + }); +} + +static inline int btree_node_read_bkey_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -867,7 +909,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -876,12 +918,20 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) goto drop_this_key; - /* XXX: validate k->u64s */ + if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, k, + btree_node_bkey_bad_u64s, + "bad k->u64s %u (min %u max %zu)", k->u64s, + bkeyp_key_u64s(&b->format, k), + U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) + goto drop_this_key; + if (!write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -889,26 +939,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, &b->format, k); - if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); @@ -918,7 +960,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&buf, u.k); if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_out_of_order, "%s", buf.buf)) goto drop_this_key; @@ -938,13 +980,12 @@ drop_this_key: * do */ - if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + if (!bkey_packed_valid(c, b, i, (void *) ((u64 
*) k + next_good_key))) { for (next_good_key = 1; next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; next_good_key++) - if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) goto got_good_key; - } /* @@ -955,7 +996,8 @@ drop_this_key: } got_good_key: le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_node_need_rewrite(b); } fsck_err: printbuf_exit(&buf); @@ -974,7 +1016,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned ptr_written = btree_ptr_sectors_written(&b->key); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; u64 start_time = local_clock(); @@ -988,13 +1031,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); @@ -1009,98 +1052,100 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, - "got wrong btree node (want %llx got %llx)\n" - "got btree %s level %llu pos %s", - bp->seq, b->data->keys.seq, - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data), - buf.buf); + "got wrong btree node: got\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, - "bad btree header: seq 0"); + "bad btree header: seq 0\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; - struct nonce nonce; bool first = !b->written; - bool csum_bad; - if (!b->written) { + if (first) { + bne = NULL; i = &b->data->keys; + } else { + bne = write_block(b); + i = &bne->keys; - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + if (i->seq != b->data->keys.seq) + break; + } - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "error decrypting btree node: %i", ret)) - 
goto fsck_err; + struct nonce nonce = btree_nonce(i, b->written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + btree_err_on(!good_csum_type, + bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) + ? -BCH_ERR_btree_node_read_err_must_retry + : -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + bool csum_bad = bch2_crc_cmp(b->data->csum, csum); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - csum_bad = bch2_crc_cmp(bne->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "error decrypting btree node: %i\n", ret)) - goto fsck_err; + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + bool csum_bad = bch2_crc_cmp(bne->csum, csum); + if (ca && csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } sectors = vstruct_sectors(bne, c->block_bits); } @@ -1128,20 +1173,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); - b->written += sectors; + b->written = min(b->written + 
sectors, btree_sectors(c)); if (blacklisted && !first) continue; @@ -1149,12 +1194,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); + + max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); } if (ptr_written) { btree_err_on(b->written < ptr_written, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); @@ -1167,7 +1214,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(bne->keys.journal_seq), true), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1178,6 +1225,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset(b, b->set, &b->data->keys); b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, + btree_buf_bytes(b) - + sizeof(struct btree_node) - + b->nr.live_u64s * sizeof(u64)); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1185,6 +1236,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; + b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); BUG_ON(b->nr.live_u64s != u64s); @@ -1198,31 +1250,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && - !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, - btree_node_bad_bkey, - "%s", buf.buf); - + !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); + set_btree_node_need_rewrite(b); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1239,12 +1281,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); + rcu_read_lock(); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } + rcu_read_unlock(); if (!ptr_written) set_btree_node_need_rewrite(b); @@ -1255,10 +1299,12 @@ out: return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) + ret == -BCH_ERR_btree_node_read_err_must_retry) { retry_read = 1; - else + } else { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); + } goto out; } @@ -1267,8 +1313,8 @@ static void btree_node_read_work(struct work_struct 
*work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; struct btree *b = rb->b; - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; struct printbuf buf = PRINTBUF; @@ -1280,8 +1326,8 @@ static void btree_node_read_work(struct work_struct *work) while (1) { retry = true; bch_info(c, "retrying read"); - ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + rb->have_ioref = ca != NULL; bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1295,7 +1341,7 @@ static void btree_node_read_work(struct work_struct *work) start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) @@ -1319,6 +1365,7 @@ start: if (!can_retry) { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); break; } } @@ -1327,11 +1374,18 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && !btree_node_read_error(b)) { - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->key.k.p); - bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + if ((saw_error || + btree_node_need_rewrite(b)) && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + if (saw_error) { + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", + __func__, buf.buf); + } bch2_btree_node_rewrite_async(c, b); } @@ -1348,12 +1402,12 @@ static void btree_node_read_endio(struct bio *bio) struct bch_fs *c = rb->c; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } struct btree_node_read_all { @@ -1440,18 +1494,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) written2 = btree_node_sectors_written(c, ra->buf[i]); if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; @@ -1518,9 +1572,10 @@ fsck_err: ret = -1; } - if (ret) + if (ret) { set_btree_node_read_error(b); - else if 
(*saw_error) + bch2_btree_lost_data(c, b->c.btree_id); + } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { @@ -1544,7 +1599,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct btree_node_read_all *ra = rb->ra; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1586,14 +1641,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; rb->b = b; rb->ra = ra; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->idx = i; rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; @@ -1619,7 +1674,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->io_complete_wq); + c->btree_read_complete_wq); } return 0; @@ -1649,20 +1704,21 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err(c, "%s", buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); - if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1674,7 +1730,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, rb->b = b; rb->ra = NULL; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; @@ -1699,7 +1755,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } } @@ -1728,16 +1784,16 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1758,15 +1814,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct 
btree_write *w) { - unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + unsigned long old, new; + old = READ_ONCE(b->will_make_reachable); do { - old = new = v; + new = old; if (!(old & 1)) break; new &= ~1UL; - } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); if (old & 1) closure_put(&((struct btree_update *) new)->cl); @@ -1777,14 +1834,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - unsigned long old, new, v; + unsigned long old, new; unsigned type = 0; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && @@ -1804,7 +1861,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); } - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); @@ -1817,10 +1874,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - __btree_node_write_done(c, b); - six_unlock_read(&b->c.lock); + /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); } static void btree_node_write_work(struct work_struct *work) @@ -1829,7 +1887,6 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - struct bch_extent_ptr *ptr; int ret = 0; btree_bounce_free(c, @@ -1841,7 +1898,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } @@ -1850,9 +1907,9 @@ static void btree_node_write_work(struct work_struct *work) } } else { - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw, @@ -1866,8 +1923,8 @@ out: return; err: set_btree_node_noevict(b); - if (!bch2_err_matches(ret, EROFS)) - bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); + bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, + "writing btree node: %s", bch2_err_str(ret)); goto out; } @@ -1879,13 +1936,14 @@ static void btree_node_write_endio(struct bio *bio) struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; unsigned long flags; if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + if (!ca || + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { @@ -1912,18 +1970,19 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level + 1, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_write, + }); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); @@ -1952,7 +2011,6 @@ static void btree_write_submit(struct work_struct *work) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; - struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; @@ -1977,8 +2035,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) * dirty bit requires a write lock, we can't race with other threads * redirtying it: */ + old = READ_ONCE(b->flags); do { - old = new = READ_ONCE(b->flags); + new = old; if (!(old & (1 << BTREE_NODE_dirty))) return; @@ -2009,14 +2068,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) new |= (1 << BTREE_NODE_write_in_flight_inner); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); - } while (cmpxchg_acquire(&b->flags, old, new) != old); + } while (!try_cmpxchg_acquire(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_need_write)) return; do_write: BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - atomic_dec(&c->btree_cache.dirty); + atomic_long_dec(&c->btree_cache.nr_dirty); BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); @@ -2078,11 +2137,11 @@ do_write: unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); - b->whiteout_u64s = 0; - - u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); + u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); le16_add_cpu(&i->u64s, u64s); + b->whiteout_u64s = 0; + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); set_needs_whiteout(i, false); @@ -2096,7 +2155,7 @@ do_write: if (!b->written && b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); @@ -2123,7 +2182,7 @@ do_write: ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, - "error encrypting btree node: %i\n", ret)) + "encrypting btree node: %s", bch2_err_str(ret))) goto err; nonce = btree_nonce(i, b->written << 9); @@ -2192,7 +2251,7 @@ do_write: 
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->io_complete_wq, &wbio->work); + queue_work(c->btree_write_submit_wq, &wbio->work); return; err: set_btree_node_noevict(b); @@ -2209,7 +2268,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) { bool invalidated_iter = false; struct btree_node_entry *bne; - struct bset_tree *t; if (!btree_node_just_written(b)) return false; @@ -2232,7 +2290,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets, true); + btree_node_sort(c, b, 0, b->nsets); invalidated_iter = true; } else { invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); @@ -2287,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } +void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, + enum six_lock_type lock_type_held, + unsigned flags) +{ + struct bch_fs *c = trans->c; + + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + __bch2_btree_node_unlock_write(trans, b); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); + } +} + static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; @@ -2329,20 +2415,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 20); printbuf_tabstop_push(out, 10); - prt_tab(out); - prt_str(out, "nr"); - prt_tab(out); - prt_str(out, "size"); - prt_newline(out); + prt_printf(out, "\tnr\tsize\n"); for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { u64 nr = atomic64_read(&c->btree_write_stats[i].nr); u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - prt_printf(out, "%s:", bch2_btree_write_types[i]); - prt_tab(out); - prt_u64(out, nr); - prt_tab(out); + prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); prt_newline(out); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e251cb6b965f..6f9e4a6dacf7 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -18,19 +18,19 @@ struct btree_node_read_all; static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) - atomic_inc(&c->btree_cache.dirty); + atomic_long_inc(&c->btree_cache.nr_dirty); } static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) - atomic_dec(&c->btree_cache.dirty); + atomic_long_dec(&c->btree_cache.nr_dirty); } -static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) { - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + return k.k->type == KEY_TYPE_btree_ptr_v2 + ? 
le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) : 0; } @@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b, static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) return bch2_compact_whiteouts(c, b, COMPACT_LAZY); @@ -146,11 +144,13 @@ enum btree_write_flags { void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type, unsigned); +void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, + enum six_lock_type, unsigned); -static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, +static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, enum six_lock_type lock_held) { - bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } bool bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3ef338df82f5..e32fce4fd258 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l, static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_successor(p); } else { p = bpos_nosnap_successor(p); @@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_predecessor(p); } else { p = bpos_nosnap_predecessor(p); @@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; @@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - unsigned i; - - EBUG_ON(path->btree_id >= BTREE_ID_NR); - for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + for (unsigned i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && bch2_btree_id_root(c, path->btree_id)->b->c.level > i); @@ -251,15 +248,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - BUG_ON(iter->btree_id >= BTREE_ID_NR); - - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + BUG_ON((iter->flags & BTREE_ITER_is_extents) && + (iter->flags & BTREE_ITER_all_snapshots)); - BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && + (iter->flags & BTREE_ITER_all_snapshots) && !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) @@ -269,14 +264,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || - bkey_gt(iter->pos, iter->k.p)); + BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : + !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) : + (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p))); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -289,7 +286,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bch2_debug_check_iterators) return 0; - if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; if (bkey_err(k) || !k.k) @@ -300,8 +297,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_nopreserve| + BTREE_ITER_all_snapshots); prev = bch2_btree_iter_prev(©); if (!prev.k) goto out; @@ -330,8 +327,10 @@ out: } void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) + struct bpos pos) { + bch2_trans_verify_not_unlocked_or_in_restart(trans); + struct btree_path *path; struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; @@ -339,19 +338,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, iter) { - int cmp = cmp_int(path->btree_id, id) ?: - cmp_int(path->cached, key_cache); - - if (cmp > 0) - break; - if (cmp < 0) - continue; - - if (!btree_node_locked(path, 0) || + if (path->btree_id != id || + !btree_node_locked(path, 0) || !path->should_be_locked) continue; - if (!key_cache) { + if (!path->cached) { if (bkey_ge(pos, path->l[0].b->data->min_key) && bkey_le(pos, path->l[0].b->key.k.p)) return; @@ -364,9 +356,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_dump_trans_paths_updates(trans); bch2_bpos_to_text(&buf, pos); - panic("not locked: %s %s%s\n", - bch2_btree_id_str(id), buf.buf, - key_cache 
? " cached" : ""); + panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } #else @@ -709,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans, bch2_trans_revalidate_updates_in_node(trans, b); } +void bch2_trans_node_drop(struct btree_trans *trans, + struct btree *b) +{ + struct btree_path *path; + unsigned i, level = b->c.level; + + trans_for_each_path(trans, path, i) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + /* * A btree node has been modified in such a way as to invalidate iterators - fix * them: @@ -732,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, path->btree_id); enum six_lock_type lock_type; unsigned i; int ret; @@ -740,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, EBUG_ON(path->nodes_locked); while (1) { - b = READ_ONCE(*rootp); + struct btree *b = READ_ONCE(r->b); + if (unlikely(!b)) { + BUG_ON(!r->error); + return r->error; + } + path->level = READ_ONCE(b->c.level); if (unlikely(path->level < depth_want)) { @@ -760,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, ret = btree_node_lock(trans, path, &b->c, path->level, lock_type, trace_ip); if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) - continue; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; BUG(); } - if (likely(b == READ_ONCE(*rootp) && + if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) @@ -837,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); + jiter->fail_if_too_many_whiteouts = true; + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -891,16 +899,29 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); + if (!k.k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " at btree "); + bch2_btree_pos_to_text(&buf, c, l->b); + + ret = bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } bch2_bkey_buf_reassemble(out, c, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); +err: bch2_btree_and_journal_iter_exit(&jiter); return ret; } @@ -927,10 +948,24 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (ret) goto err; } else { - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); + if (!k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = 
-BCH_ERR_btree_need_topology_repair; + goto err; + } + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) @@ -962,7 +997,6 @@ err: return ret; } - static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -986,6 +1020,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); + trans_set_locked(trans, false); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1008,9 +1043,9 @@ retry_all: * the same position: */ if (trans->paths[idx].uptodate) { - __btree_path_get(&trans->paths[idx], false); + __btree_path_get(trans, &trans->paths[idx], false); ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); - __btree_path_put(&trans->paths[idx], false); + __btree_path_put(trans, &trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1129,6 +1164,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(!trans->srcu_held)) bch2_trans_srcu_lock(trans); + trace_btree_path_traverse_start(trans, path); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1146,9 +1183,10 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); + unsigned max_level = path->level; EBUG_ON(btree_path_node(path, path->level) && !btree_node_locked(path, path->level)); @@ -1180,7 +1218,18 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, } } + if (unlikely(max_level > path->level)) { + struct btree_path *linked; + unsigned iter; + + trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter) + for (unsigned j = path->level + 1; j < max_level; j++) + linked->l[j] = path->l[j]; + } + +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; + trace_btree_path_traverse_end(trans, path); out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) panic("ret %s (%i) trans->restarted %s (%i)\n", @@ -1208,11 +1257,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent) + bool intent, unsigned long ip) { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); - __btree_path_get(trans->paths + new, intent); + __btree_path_get(trans, trans->paths + new, intent); +#ifdef TRACK_PATH_ALLOCATED + trans->paths[new].ip_allocated = ip; +#endif return new; } @@ -1220,8 +1272,10 @@ __flatten btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(trans->paths + path, intent); - path = btree_path_clone(trans, path, intent); + struct btree_path *old = trans->paths + path; + __btree_path_put(trans, trans->paths + path, intent); + path = btree_path_clone(trans, path, intent, ip); + trace_btree_path_clone(trans, old, trans->paths + path); trans->paths[path].preserve = false; return path; } @@ -1233,9 +1287,11 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - bch2_trans_verify_not_in_restart(trans); + 
bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); + trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); struct btree_path *path = trans->paths + path_idx; @@ -1321,24 +1377,51 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t __clear_bit(path, trans->paths_allocated); } +static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) +{ + unsigned l = path->level; + + do { + if (!btree_path_node(path, l)) + break; + + if (!is_btree_node(path, l)) + return false; + + if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) + return false; + + l++; + } while (l < path->locks_want); + + return true; +} + void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { struct btree_path *path = trans->paths + path_idx, *dup; - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans, path, intent)) return; dup = path->preserve ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); + trace_btree_path_free(trans, path_idx, dup); + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; - if (path->should_be_locked && - !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup))) - return; + if (path->should_be_locked && !trans->restarted) { + if (!dup) + return; + + if (!(trans->locked + ? bch2_btree_path_relock_norestart(trans, dup) + : bch2_btree_path_can_relock(trans, dup))) + return; + } if (dup) { dup->preserve |= path->preserve; @@ -1351,7 +1434,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - if (!__btree_path_put(trans->paths + path, intent)) + if (!__btree_path_put(trans, trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1364,29 +1447,48 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ (void *) trans->last_begin_ip); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + bch2_prt_backtrace(&buf, &trans->last_restarted_trace); + panic("in transaction restart: %s, last restarted by\n%s", + bch2_err_str(trans->restarted), + buf.buf); +#else panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), (void *) trans->last_restarted_ip); +#endif +} + +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); + + if (!trans->locked) + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); + + BUG(); } noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu", - trans->fn, trans->journal_res.seq); - prt_newline(buf); + prt_printf(buf, "%u transaction updates for %s journal seq %llu\n", + trans->nr_updates, trans->fn, trans->journal_res.seq); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS", - bch2_btree_id_str(i->btree_id), - i->cached, - (void *) i->ip_allocated); - prt_newline(buf); + prt_str(buf, "update: btree="); 
+ bch2_btree_id_to_text(buf, i->btree_id); + prt_printf(buf, " cached=%u %pS\n", + i->cached, + (void *) i->ip_allocated); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1411,27 +1513,75 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } -static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - bch2_btree_id_str(path->btree_id), - path->level); + path->cached ? 'C' : 'B'); + bch2_btree_id_level_to_text(out, path->btree_id, path->level); + prt_str(out, " pos "); bch2_bpos_to_text(out, path->pos); - prt_printf(out, " locks %u", path->nodes_locked); + if (!path->cached && btree_node_locked(path, path->level)) { + prt_char(out, ' '); + struct btree *b = path_l(path)->b; + bch2_bpos_to_text(out, b->data->min_key); + prt_char(out, '-'); + bch2_bpos_to_text(out, b->key.k.p); + } + #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif +} + +static const char *btree_node_locked_str(enum btree_node_locked_type t) +{ + switch (t) { + case BTREE_NODE_UNLOCKED: + return "unlocked"; + case BTREE_NODE_READ_LOCKED: + return "read"; + case BTREE_NODE_INTENT_LOCKED: + return "intent"; + case BTREE_NODE_WRITE_LOCKED: + return "write"; + default: + return NULL; + } +} + +void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +{ + bch2_btree_path_to_text_short(out, trans, path_idx); + + struct btree_path *path = trans->paths + path_idx; + + prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { + prt_printf(out, "l=%u locks %s seq %u node ", l, + btree_node_locked_str(btree_node_locked_type(path, l)), + path->l[l].lock_seq); + + int ret = PTR_ERR_OR_ZERO(path->l[l].b); + if (ret) + prt_str(out, bch2_err_str(ret)); + else + prt_printf(out, "%px", path->l[l].b); + prt_newline(out); + } + printbuf_indent_sub(out, 2); } static noinline __cold @@ -1443,8 +1593,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_idx_inorder(trans, iter) - bch2_btree_path_to_text(out, trans, iter.path_idx); + trans_for_each_path_idx_inorder(trans, iter) { + bch2_btree_path_to_text_short(out, trans, iter.path_idx); + prt_newline(out); + } } noinline __cold @@ -1461,7 +1613,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } @@ -1520,7 +1672,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + void *p = 
kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + nr * sizeof(btree_path_idx_t) + 8 + @@ -1595,12 +1747,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, unsigned flags, unsigned long ip) { struct btree_path *path; - bool cached = flags & BTREE_ITER_CACHED; - bool intent = flags & BTREE_ITER_INTENT; + bool cached = flags & BTREE_ITER_cached; + bool intent = flags & BTREE_ITER_intent; struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); @@ -1620,14 +1772,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && trans->paths[path_pos].level == level) { - __btree_path_get(trans->paths + path_pos, intent); + trace_btree_path_get(trans, trans->paths + path_pos, &pos); + + __btree_path_get(trans, trans->paths + path_pos, intent); path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); path = trans->paths + path_idx; } else { path_idx = btree_path_alloc(trans, path_pos); path = trans->paths + path_idx; - __btree_path_get(path, intent); + __btree_path_get(trans, path, intent); path->pos = pos; path->btree_id = btree_id; path->cached = cached; @@ -1642,9 +1796,11 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, path->ip_allocated = ip; #endif trans->paths_sorted = false; + + trace_btree_path_alloc(trans, path); } - if (!(flags & BTREE_ITER_NOPRESERVE)) + if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; if (path->intent_ref) @@ -1665,6 +1821,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, return path_idx; } +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_nopreserve| + BTREE_ITER_intent, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path_idx; +} + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { @@ -1688,15 +1860,14 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; - - EBUG_ON(ck && - (path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos))); - if (!ck || !ck->valid) + if (!ck) return bkey_s_c_null; + EBUG_ON(path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos)); + *u = ck->k->k; - k = bkey_i_to_s_c(ck->k); + k = (struct bkey_s_c) { u, &ck->k->v }; } return k; @@ -1706,6 +1877,18 @@ hole: return (struct bkey_s_c) { u, NULL }; } +void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; +} /* Btree iterators: */ int __must_check @@ -1720,16 +1903,20 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; + bch2_trans_verify_not_unlocked_or_in_restart(trans); + 
iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) return ret; - btree_path_set_should_be_locked(trans->paths + iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + if (btree_path_node(path, path->level)) + btree_path_set_should_be_locked(trans, path); return 0; } @@ -1759,9 +1946,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1772,6 +1959,7 @@ err: goto out; } +/* Only kept for -tools */ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; @@ -1790,9 +1978,14 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + struct btree_path *path = btree_iter_path(trans, iter); /* already at end? */ @@ -1820,13 +2013,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { + if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, path->level + 1); + /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ iter->path = bch2_btree_path_set_pos(trans, iter->path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); path = btree_iter_path(trans, iter); @@ -1844,9 +2040,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -1863,11 +2059,11 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? bpos_eq(pos, SPOS_MAX) : bkey_eq(pos, SPOS_MAX)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -1876,11 +2072,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? 
bpos_eq(pos, POS_MIN) : bkey_eq(pos, POS_MIN)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -1938,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, { struct btree_path *path = btree_iter_path(trans, iter); - return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, path->pos, end_pos, @@ -1961,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); - + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } +} - return k; +static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); +} + +static noinline +void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = + bch2_btree_journal_peek_prev(trans, iter, + k->k ? 
k->k->p : path_l(path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + *k = bkey_i_to_s_c(next_journal); + } } /* @@ -1991,7 +2213,9 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) return bkey_s_c_null; @@ -2000,28 +2224,32 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL, + iter->flags & BTREE_ITER_intent, 0, + iter->flags|BTREE_ITER_cached| + BTREE_ITER_cached_nofill, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_CACHED) ?: + iter->flags|BTREE_ITER_cached) ?: bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - iter->k = u; - k.k = &iter->k; - } + if (!k.k) + return k; + + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + + iter->k = u; + k.k = &iter->k; + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } @@ -2035,10 +2263,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { - struct btree_path_level *l; - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2046,38 +2272,37 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + break; } struct btree_path *path = btree_iter_path(trans, iter); - l = path_l(path); + struct btree_path_level *l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; - ret = bkey_err(k); - if (ret) { + if (bkey_err(k)) { bch2_btree_iter_set_pos(iter, iter->pos); - goto out; + break; } } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) - k = btree_trans_peek_journal(trans, iter, k); + if (unlikely(iter->flags & BTREE_ITER_with_journal)) + btree_trans_peek_journal(trans, iter, &k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_updates(trans, iter, &k); @@ -2104,41 
+2329,46 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } } -out: - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(iter); return k; } /** - * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; - struct bpos iter_pos; + struct bpos iter_pos = iter->pos; int ret; - EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } - bch2_btree_iter_verify_entry_exit(iter); - while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) @@ -2146,75 +2376,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (unlikely(bkey_err(k))) goto out_no_locked; - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode that requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - * - * But we can't do the full check here, because bkey_start_pos() - * isn't monotonically increasing before FILTER_SNAPSHOTS, and - * that's what we check against in extents mode: - */ - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_gt(k.k->p, end) - : k.k->p.inode > end.inode)) - goto end; + if (iter->flags & BTREE_ITER_filter_snapshots) { + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + * + * But we can't do the full check here, because bkey_start_pos() + * isn't monotonically increasing before FILTER_SNAPSHOTS, and + * that's what we check against in extents mode: + */ + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) + ? 
bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) + goto end; + + if (iter->update_path && + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } - if (iter->update_path && - !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); - iter->update_path = 0; - } + if ((iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && + !iter->update_path) { + struct bpos pos = k.k->p; - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - (iter->flags & BTREE_ITER_INTENT) && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && - !iter->update_path) { - struct bpos pos = k.k->p; + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } - if (pos.snapshot < iter->snapshot) { - search_key = bpos_successor(k.k->p); - continue; - } + pos.snapshot = iter->snapshot; - pos.snapshot = iter->snapshot; + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } /* - * advance, same as on exit for iter->path, but only up - * to snapshot + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = iter->path; - - iter->update_path = bch2_btree_path_set_pos(trans, - iter->update_path, pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; + if (!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; } - } - - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { - search_key = bkey_successor(iter, k.k->p); - continue; + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_key_cache_fill)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } } /* @@ -2222,14 +2452,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * equal to the key we just returned - except extents can * straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (!(iter->flags & BTREE_ITER_is_extents)) iter_pos = k.k->p; else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : + iter->flags & BTREE_ITER_is_extents ? 
bkey_ge(iter_pos, end) : + bkey_gt(iter_pos, end))) goto end; break; @@ -2238,20 +2468,20 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(trans->paths + iter->update_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->update_path); } - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; ret = bch2_btree_iter_verify_ret(iter, k); @@ -2284,135 +2514,224 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -/** - * bch2_btree_iter_peek_prev() - returns first key less than or equal to - * iterator's current position - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = iter->pos; - struct bkey_s_c k; - struct bkey saved_k; - const struct bch_val *saved_v; - btree_path_idx_t saved_path = 0; - int ret; - - EBUG_ON(btree_iter_path(trans, iter)->cached || - btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + struct bkey_s_c k, k2; bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); - - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) - search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out_no_locked; + break; } struct btree_path *path = btree_iter_path(trans, iter); + struct btree_path_level *l = path_l(path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } + + btree_path_set_should_be_locked(trans, path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + if (!k.k || bpos_gt(k.k->p, search_key)) { + k = btree_path_level_prev(trans, path, l, &iter->k); + + BUG_ON(k.k && bpos_gt(k.k->p, search_key)); + } + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k2)) { + bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } - k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); - if (!k.k || - ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? 
bpos_ge(bkey_start_pos(k.k), search_key) - : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + if (unlikely(iter->flags & BTREE_ITER_with_journal)) + btree_trans_peek_prev_journal(trans, iter, &k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); - if (likely(k.k)) { - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { - if (k.k->p.snapshot == iter->snapshot) - goto got_key; + if (likely(k.k && !bkey_deleted(k.k))) { + break; + } else if (k.k) { + search_key = bpos_predecessor(k.k->p); + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + break; + } + } + + bch2_btree_iter_verify(iter); + return k; +} + +/** + * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys greater than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +{ + if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && + !bkey_eq(iter->pos, POS_MAX)) { + /* + * bkey_start_pos(), for extents, is not monotonically + * increasing until after filtering for snapshots: + * + * Thus, for extents we need to search forward until we find a + * real visible extents - easiest to just use peek_slot() (which + * internally uses peek() for extents) + */ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; + + if (!bkey_deleted(k.k) && + (!(iter->flags & BTREE_ITER_is_extents) || + bkey_lt(bkey_start_pos(k.k), iter->pos))) + return k; + } + + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct bkey_s_c k; + btree_path_idx_t saved_path = 0; + + bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + while (1) { + k = __bch2_btree_iter_peek_prev(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; + + if (iter->flags & BTREE_ITER_filter_snapshots) { + struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; + if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { + /* + * If we have a saved candidate, and we're past + * the last possible snapshot overwrite, return + * it: + */ + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; + saved_path = 0; + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + break; + } + + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. 
+ */ + if (unlikely(bkey_lt(k.k->p, end))) + goto end; + + if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { + search_key = bpos_predecessor(k.k->p); + continue; + } + if (k.k->p.snapshot != iter->snapshot) { /* - * If we have a saved candidate, and we're no - * longer at the same _key_ (not pos), return - * that candidate + * Have a key visible in iter->snapshot, but + * might have overwrites: - save it and keep + * searching. Unless it's a whiteout - then drop + * our previous saved candidate: */ - if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { - bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); - iter->path = saved_path; + if (saved_path) { + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; - iter->k = saved_k; - k.v = saved_v; - goto got_key; } - if (bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - if (saved_path) - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_INTENT); + if (!bkey_whiteout(k.k)) { saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); - path = btree_iter_path(trans, iter); - saved_k = *k.k; - saved_v = k.v; + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + trace_btree_path_save_pos(trans, + trans->paths + iter->path, + trans->paths + saved_path); } search_key = bpos_predecessor(k.k->p); continue; } -got_key: - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + + if (bkey_whiteout(k.k)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) - search_key.snapshot = U32_MAX; + search_key.snapshot = U32_MAX; continue; } - - btree_path_set_should_be_locked(path); - break; - } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); - k = bkey_s_c_null; - goto out_no_locked; } - } - EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : + bkey_gt(k.k->p, iter->pos)); + + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : + iter->flags & BTREE_ITER_is_extents ? 
bkey_le(k.k->p, end) : + bkey_lt(k.k->p, end))) + goto end; + + break; + } /* Extents can straddle iter->pos: */ - if (bkey_lt(k.k->p, iter->pos)) - iter->pos = k.k->p; + iter->pos = bpos_min(iter->pos, k.k->p);; - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; } /** @@ -2437,12 +2756,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; @@ -2452,7 +2778,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2461,22 +2787,26 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - if ((iter->flags & BTREE_ITER_CACHED) || - !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct btree_path *path = btree_iter_path(trans, iter); + if (unlikely(!btree_path_node(path, path->level))) + return bkey_s_c_null; + + if ((iter->flags & BTREE_ITER_cached) || + !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) { bch2_btree_trans_peek_slot_updates(trans, iter, &k); if (k.k) goto out; } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + if (unlikely(iter->flags & BTREE_ITER_with_journal) && (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; @@ -2487,22 +2817,28 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; + + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_key_cache_fill))) + iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; struct bpos end = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) end.offset = U64_MAX; EBUG_ON(btree_iter_path(trans, 
iter)->level); - if (iter->flags & BTREE_ITER_INTENT) { + if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } @@ -2510,7 +2846,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_upto(iter, end); + k = bch2_btree_iter_peek_max(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else @@ -2526,7 +2862,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (iter->flags & BTREE_ITER_is_extents) { bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, (next.inode == iter->pos.inode @@ -2540,7 +2876,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2567,6 +2903,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +/* Obsolete, but still used by rust wrapper in -tools */ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { struct bkey_s_c k; @@ -2710,13 +3047,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; @@ -2729,7 +3066,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -2741,12 +3078,15 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_not_extents; + flags |= BTREE_ITER_snapshot_field; + flags |= BTREE_ITER_all_snapshots; + + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; @@ -2762,10 +3102,13 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & 
BTREE_ITER_INTENT); + __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -2781,9 +3124,38 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); + if (trans->used_mempool) { + if (trans->mem_bytes >= new_bytes) + goto out_change_top; + + /* No more space from mempool item, need malloc new one */ + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = kmalloc(new_bytes, GFP_KERNEL); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_mem); + return ERR_PTR(ret); + } + } + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = false; + mempool_free(trans->mem, &c->btree_trans_mem_pool); + goto out_new_mem; + } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); @@ -2792,6 +3164,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = true; kfree(trans->mem); } @@ -2805,15 +3179,16 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } - +out_new_mem: trans->mem = new_mem; trans->mem_bytes = new_bytes; if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } - +out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -2907,7 +3282,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { - drop_locks_do(trans, (cond_resched(), 0)); + bch2_trans_unlock(trans); + cond_resched(); now = local_clock(); } trans->last_begin_time = now; @@ -2917,11 +3293,23 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; + +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + + trans_set_locked(trans, false); + if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } + bch2_trans_verify_not_unlocked_or_in_restart(trans); return trans->restart_count; } @@ -2955,7 +3343,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -2974,16 +3361,11 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) */ BUG_ON(pos_task && pid == pos_task->pid && - bch2_trans_locked(pos)); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, 
&pos->list); - goto list_add_done; - } + pos->locked); } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; @@ -2991,7 +3373,7 @@ got_trans: trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); trans->nr_paths = ARRAY_SIZE(trans->_paths); trans->paths_allocated = trans->_paths_allocated; @@ -3003,6 +3385,9 @@ got_trans: trans->paths_allocated[0] = 1; + static struct lock_class_key lockdep_key; + lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0); + if (fn_idx < BCH_TRANSACTIONS_NR) { trans->fn = bch2_btree_transaction_fns[fn_idx]; @@ -3023,6 +3408,9 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + trans_set_locked(trans, false); + + closure_init_stack_release(&trans->ref); return trans; } @@ -3054,12 +3442,14 @@ void bch2_trans_put(struct btree_trans *trans) { struct bch_fs *c = trans->c; + if (trans->restarted) + bch2_trans_in_restart_error(trans); + bch2_trans_unlock(trans); trans_for_each_update(trans, i) - __btree_path_put(trans->paths + i->path, true); + __btree_path_put(trans, trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3068,26 +3458,28 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - if (trans->fs_usage_deltas) { - if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == - REPLICAS_DELTA_LIST_MAX) - mempool_free(trans->fs_usage_deltas, - &c->replicas_delta_pool); - else - kfree(trans->fs_usage_deltas); - } - if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths array; used + * by cycle detector + */ + closure_return_sync(&trans->ref); + trans->locking_wait.task = NULL; + +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); +#endif + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; if (paths_allocated != trans->_paths_allocated) - kfree_rcu_mightsleep(paths_allocated); + kvfree_rcu_mightsleep(paths_allocated); - if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + if (trans->used_mempool) mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); @@ -3097,8 +3489,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3107,6 +3497,21 @@ void bch2_trans_put(struct btree_trans *trans) } } +bool bch2_current_has_btree_trans(struct bch_fs *c) +{ + seqmutex_lock(&c->btree_trans_lock); + struct btree_trans *trans; + bool ret = false; + list_for_each_entry(trans, &c->btree_trans_list, list) + if (trans->locking_wait.task == current && + trans->locked) { + ret = true; + break; + } + seqmutex_unlock(&c->btree_trans_lock); + return ret; +} + static void __maybe_unused bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) @@ -3120,13 +3525,12 @@ 
bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_tab(out); - prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_id_str(b->btree_id)); + prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); + prt_printf(out, " l=%u:", b->level); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_tab(out); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_printf(out, "\t locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -3162,11 +3566,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; - prt_printf(out, " path %u %c l=%u %s:", - idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_id_str(path->btree_id)); + prt_printf(out, " path %u %c ", + idx, + path->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, path->btree_id); + prt_printf(out, " l=%u:", path->level); bch2_bpos_to_text(out, path->pos); prt_newline(out); @@ -3183,10 +3587,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_printf(out, " blocked for %lluus on", - div_u64(local_clock() - trans->locking_wait.start_time, - 1000)); - prt_newline(out); + prt_printf(out, " blocked for %lluus on\n", + div_u64(local_clock() - trans->locking_wait.start_time, 1000)); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); @@ -3208,8 +3610,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3229,8 +3629,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) bch2_time_stats_exit(&s->lock_hold_times); } - if (c->btree_trans_barrier_initialized) + if (c->btree_trans_barrier_initialized) { + synchronize_srcu_expedited(&c->btree_trans_barrier); cleanup_srcu_struct(&c->btree_trans_barrier); + } mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_trans_pool); } @@ -3264,7 +3666,22 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); - if (!ret) - c->btree_trans_barrier_initialized = true; - return ret; + if (ret) + return ret; + + /* + * static annotation (hackily done) for lock ordering of reclaim vs. 
+ * btree node locks: + */ +#ifdef CONFIG_LOCKDEP + fs_reclaim_acquire(GFP_KERNEL); + struct btree_trans *trans = bch2_trans_get(c); + trans_set_locked(trans, false); + bch2_trans_put(trans); + fs_reclaim_release(GFP_KERNEL); +#endif + + c->btree_trans_barrier_initialized = true; + return 0; + } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24772538e4cc..b96157f3dc9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -6,6 +6,12 @@ #include "btree_types.h" #include "trace.h" +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); + static inline int __bkey_err(const struct bkey *k) { return PTR_ERR_OR_ZERO(k); @@ -13,16 +19,30 @@ static inline int __bkey_err(const struct bkey *k) #define bkey_err(_k) __bkey_err((_k).k) -static inline void __btree_path_get(struct btree_path *path, bool intent) +static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent) { + unsigned idx = path - trans->paths; + + EBUG_ON(idx >= trans->nr_paths); + EBUG_ON(!test_bit(idx, trans->paths_allocated)); + if (unlikely(path->ref == U8_MAX)) { + bch2_dump_trans_paths_updates(trans); + panic("path %u refcount overflow\n", idx); + } + path->ref++; path->intent_ref += intent; + trace_btree_path_get_ll(trans, path); } -static inline bool __btree_path_put(struct btree_path *path, bool intent) +static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(path - trans->paths >= trans->nr_paths); + EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); + + trace_btree_path_put_ll(trans, path); path->intent_ref -= intent; return --path->ref == 0; } @@ -216,9 +236,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); + static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { + bch2_trans_verify_not_unlocked_or_in_restart(trans); + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -227,6 +251,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, + unsigned, struct bpos); + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); /* @@ -261,12 +288,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, - struct bpos, bool); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) {} + struct bpos pos) {} #endif void 
bch2_btree_path_fix_key_modified(struct btree_trans *trans, @@ -283,7 +309,6 @@ int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); void bch2_trans_unlock_long(struct btree_trans *); -bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { @@ -301,30 +326,45 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); -static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) { - if (trans->restarted) - bch2_trans_in_restart_error(trans); + if (trans->restarted || !trans->locked) + bch2_trans_unlocked_or_in_restart_error(trans); } __always_inline -static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; - trans->last_restarted_ip = _THIS_IP_; + trans->last_restarted_ip = ip; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); + bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); +#endif return -err; } __always_inline static int btree_trans_restart(struct btree_trans *trans, int err) { - btree_trans_restart_nounlock(trans, err); - return -err; + return btree_trans_restart_ip(trans, err, _THIS_IP_); +} + +static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; } bool bch2_btree_node_upgrade(struct btree_trans *, @@ -344,6 +384,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); +void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -353,15 +394,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - return bch2_btree_iter_peek_upto(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(iter, SPOS_MAX); +} + +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); + +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_prev_min(iter, POS_MIN); } -struct bkey_s_c 
bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); @@ -386,10 +433,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) new_pos.snapshot = iter->snapshot; __bch2_btree_iter_set_pos(iter, new_pos); @@ -397,7 +444,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { - BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); iter->pos = bkey_start_pos(&iter->k); } @@ -412,41 +459,35 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned level, + unsigned flags) { - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; + + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; + flags |= BTREE_ITER_is_extents; - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_snapshot_field) && !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + flags &= ~BTREE_ITER_all_snapshots; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_all_snapshots) && btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; + flags |= BTREE_ITER_filter_snapshots; if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; + flags |= BTREE_ITER_with_journal; return flags; } -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) -{ - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - - return __bch2_btree_iter_flags(trans, btree_id, flags); -} - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -483,7 +524,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); @@ -494,16 +535,16 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, unsigned, unsigned, unsigned); void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline void set_btree_iter_dontneed(struct btree_iter *iter) -{ - struct btree_trans 
*trans = iter->trans; - - if (!trans->restarted) - btree_iter_path(trans, iter)->preserve = false; -} +void bch2_set_btree_iter_dontneed(struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { size = roundup(size, 8); @@ -562,23 +603,30 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) +{ + unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); + memcpy(dst_v, src_k.v, b); + if (unlikely(b < dst_size)) + memset(dst_v + b, 0, dst_size - b); +} + +#define bkey_val_copy(_dst_v, _src_k) \ +do { \ + BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ + __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ +} while (0) + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, unsigned val_size, void *val) { struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - ret = bkey_err(k); + struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + int ret = bkey_err(k); if (!ret) { - unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); - - memcpy(val, k.v, b); - if (unlikely(b < sizeof(*val))) - memset((void *) val + b, 0, sizeof(*val) - b); + __bkey_val_copy(val, val_size, k); bch2_trans_iter_exit(trans, &iter); } @@ -593,44 +641,56 @@ void bch2_trans_srcu_unlock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); -/* - * XXX - * this does not handle transaction restarts from bch2_btree_iter_next_node() - * correctly - */ -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _ret) \ - for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ - !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ - (_b) = bch2_btree_iter_next_node(&(_iter))) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _do) \ +({ \ + bch2_trans_begin((_trans)); \ + \ + struct btree_iter _iter; \ + bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _ret) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _ret) + _flags, _b, _do) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, struct bpos end, unsigned flags) { - if (!(flags & BTREE_ITER_SLOTS)) - return bch2_btree_iter_peek_upto(iter, end); + if (!(flags & BTREE_ITER_slots)) + return bch2_btree_iter_peek_max(iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; @@ -642,7 +702,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; @@ -691,22 +751,18 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ +#define for_each_btree_key_max_continue(_trans, _iter, \ + _end, _flags, _k, _do) \ ({ \ - struct btree_iter _iter; \ struct bkey_s_c _k; \ int _ret3 = 0; \ \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ @@ -719,9 +775,24 @@ transaction_restart: \ _ret3; \ }) +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ + for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +#define for_each_btree_key_max(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + bch2_trans_begin(trans); \ + \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ +}) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ @@ -765,59 +836,45 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -static inline struct bkey_s_c 
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -#define for_each_btree_key_old(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - -#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in @@ -827,7 +884,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ - _do ?: bch2_trans_relock(_trans); \ + (_do) ?: bch2_trans_relock(_trans); \ }) #define allocate_dropping_locks_errcode(_trans, _do) \ @@ -837,7 +894,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, \ if (bch2_err_matches(_ret, ENOMEM)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, _do); \ + _ret = drop_locks_do(_trans, _do); \ } \ _ret; \ }) @@ -850,19 +907,26 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret = 0; \ if (unlikely(!_p)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ } \ _p; \ }) -void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_updates(struct btree_trans *); -void bch2_dump_trans_paths_updates(struct btree_trans *); +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = 
(_do); \ + bch2_trans_put(trans); \ + _ret; \ +}) + +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); +bool bch2_current_has_btree_trans(struct bch_fs *); + extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 719a94a84950..6d25e3f85ce8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -14,19 +16,15 @@ * operations for the regular btree iter code to use: */ -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) +static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) { - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} + size_t gap_size = keys->size - keys->nr; -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); + BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); + + if (pos >= keys->gap) + pos -= gap_size; + return pos; } static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) @@ -40,7 +38,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) { - return keys->d + idx_to_pos(keys, idx); + return keys->data + idx_to_pos(keys, idx); } static size_t __bch2_journal_key_search(struct journal_keys *keys, @@ -74,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, } /* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) { @@ -97,27 +95,92 @@ search: } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; + break; if (k->overwritten) { - (*idx)++; + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->end; + else + *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { + ret = k->k; + break; + } (*idx)++; iters++; if (iters == 10) { *idx = 0; + rcu_read_unlock(); goto search; } } - return NULL; + rcu_read_unlock(); + return ret; +} + +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + break; + + if (k->overwritten) { + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->start - 1; + else + *idx -= 1; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { + ret = k->k; + break; + } + + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, @@ -125,7 +188,25 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree { size_t idx = 0; - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); + return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iter_verify(struct journal_iter *iter) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + BUG_ON(cmp > 0); + } +#endif } static void journal_iters_fix(struct bch_fs *c) @@ -133,7 +214,8 @@ static void journal_iters_fix(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; /* * If an iterator points one after the key we just inserted, decrement @@ -141,9 +223,14 @@ static void journal_iters_fix(struct bch_fs *c) * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will * handle that: */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } } static void journal_iters_move_gap(struct bch_fs *c, 
size_t old_gap, size_t new_gap) @@ -172,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, * Ensure these keys are done last by journal replay, to unblock * journal reclaim: */ - .journal_seq = U32_MAX, + .journal_seq = U64_MAX, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); @@ -180,33 +267,38 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; + journal_key_cmp(&n, &keys->data[idx]) == 0) { + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; return 0; } if (idx > keys->gap) idx -= keys->size - keys->nr; + size_t old_gap = keys->gap; + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - keys->d = new_keys.d; + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); + kvfree(keys->data); + keys->data = new_keys.data; keys->nr = new_keys.nr; keys->size = new_keys.size; @@ -214,13 +306,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, keys->gap = keys->nr; } - journal_iters_move_gap(c, keys->gap, idx); + journal_iters_move_gap(c, old_gap, idx); - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; + move_gap(keys, idx); keys->nr++; - keys->d[keys->gap++] = n; + keys->data[keys->gap++] = n; journal_iters_fix(c); @@ -260,6 +351,84 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + +static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) +{ + struct journal_key *k = keys->data + pos; + size_t idx = pos_to_idx(keys, pos); + + k->overwritten = true; + + struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; + struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; + + bool prev_overwritten = prev && prev->overwritten; + bool next_overwritten = next && next->overwritten; + + struct journal_key_range_overwritten *prev_range = + prev_overwritten ? prev->overwritten_range : NULL; + struct journal_key_range_overwritten *next_range = + next_overwritten ? 
next->overwritten_range : NULL; + + BUG_ON(prev_range && prev_range->end != idx); + BUG_ON(next_range && next_range->start != idx + 1); + + if (prev_range && next_range) { + prev_range->end = next_range->end; + + keys->data[pos].overwritten_range = prev_range; + for (size_t i = next_range->start; i < next_range->end; i++) { + struct journal_key *ip = keys->data + idx_to_pos(keys, i); + BUG_ON(ip->overwritten_range != next_range); + ip->overwritten_range = prev_range; + } + + kfree_rcu_mightsleep(next_range); + } else if (prev_range) { + prev_range->end++; + k->overwritten_range = prev_range; + if (next_overwritten) { + prev_range->end++; + next->overwritten_range = prev_range; + } + } else if (next_range) { + next_range->start--; + k->overwritten_range = next_range; + if (prev_overwritten) { + next_range->start--; + prev->overwritten_range = next_range; + } + } else if (prev_overwritten || next_overwritten) { + struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return; + + r->start = idx - (size_t) prev_overwritten; + r->end = idx + 1 + (size_t) next_overwritten; + + rcu_assign_pointer(k->overwritten_range, r); + if (prev_overwritten) + prev->overwritten_range = r; + if (next_overwritten) + next->overwritten_range = r; + } +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -267,10 +436,14 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { + mutex_lock(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); + mutex_unlock(&keys->overwrite_lock); + } } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -284,19 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->d + iter->idx; + struct bkey_s_c ret = bkey_s_c_null; + + journal_iter_verify(iter); - while (k < iter->keys->d + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { - if (!k->overwritten) - return bkey_i_to_s_c(k->k); + rcu_read_lock(); + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; - bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + if (cmp < 0) + break; + BUG_ON(cmp); + + if (!k->overwritten) { + ret = bkey_i_to_s_c(k->k); + break; + } + + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); } + rcu_read_unlock(); - return bkey_s_c_null; + return ret; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -313,6 +499,8 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->level = level; iter->keys = &c->journal_keys; iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -334,20 +522,57 @@ void bch2_btree_and_journal_iter_advance(struct 
btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + iter.fail_if_too_many_whiteouts = true; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c btree_k, journal_k, ret; + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; + size_t iters = 0; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; + iters++; + + if (iters > 20 && iter->fail_if_too_many_whiteouts) + return bkey_s_c_null; + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) @@ -376,51 +601,44 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } } /* * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); } /* sort and dedup all keys in the journal: */ -void 
bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); - genradix_free(&c->journal_entries); -} - /* * When keys compare equal, oldest compares first: */ @@ -437,106 +655,78 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_put(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key *i; BUG_ON(atomic_read(&keys->ref) <= 0); if (!atomic_dec_and_test(&keys->ref)) return; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + if (i->overwritten_range && + (i == &darray_last(*keys) || + i->overwritten_range != i[1].overwritten_range)) + kfree(i->overwritten_range); - for (i = keys->d; i < keys->d + keys->nr; i++) if (i->allocated) kfree(i->k); + } - kvfree(keys->d); - keys->d = NULL; + kvfree(keys->data); + keys->data = NULL; keys->nr = keys->gap = keys->size = 0; - bch2_journal_entries_free(c); + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + kvfree(*i); + genradix_free(&c->journal_entries); } static void __journal_keys_sort(struct journal_keys *keys) { - struct journal_key *src, *dst; + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + cond_resched(); - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - !journal_key_cmp(src, src + 1)) - src++; + struct journal_key *dst = keys->data; + + darray_for_each(*keys, src) { + /* + * We don't accumulate accounting keys here because we have to + * compare each individual accounting key against the version in + * the btree during replay: + */ + if (src->k->k.type != KEY_TYPE_accounting && + src + 1 < &darray_top(*keys) && + !journal_key_cmp(src, src + 1)) + continue; - *dst++ = *src++; + *dst++ = *src; } - keys->nr = dst - keys->d; + keys->nr = dst - keys->data; } int bch2_journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } + size_t nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; cond_resched(); for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have 
%zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { + struct journal_key n = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, @@ -544,6 +734,18 @@ int bch2_journal_keys_sort(struct bch_fs *c) .journal_offset = k->_data - i->j._data, }; + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + + BUG_ON(darray_push(keys, n)); + } + nr_read++; } } @@ -551,6 +753,54 @@ int bch2_journal_keys_sort(struct bch_fs *c) __journal_keys_sort(keys); keys->gap = keys->nr; - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} + +void bch2_journal_keys_dump(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct printbuf buf = PRINTBUF; + + pr_info("%zu keys:", keys->nr); + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + printbuf_reset(&buf); + prt_printf(&buf, "btree="); + bch2_btree_id_to_text(&buf, i->btree_id); + prt_printf(&buf, " l=%u ", i->level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s", buf.buf); + } + printbuf_exit(&buf); +} + +void bch2_fs_journal_keys_init(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + atomic_set(&keys->ref, 1); + keys->initial_ref_held = true; + mutex_init(&keys->overwrite_lock); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 8ca4c100b2e3..2a3082919b8d 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H #define _BCACHEFS_BTREE_JOURNAL_ITER_H +#include "bkey.h" + struct journal_iter { struct list_head list; enum btree_id btree_id; @@ -15,6 +17,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -22,32 +25,60 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; + bool fail_if_too_many_whiteouts; }; -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, +static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, + unsigned l_level, + const struct journal_key *r) +{ + return -cmp_int(l_level, r->level) ?: + cmp_int(l_btree_id, r->btree_id); +} + +static inline int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: + bpos_cmp(l_pos, r->k->k.p); +} + +static inline int 
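The compaction above relies on the sort comparator placing older versions of the same key first, so the dedup pass keeps only the last entry of each run of equal keys (accounting keys are exempt, since every version has to be compared against the btree during replay). A standalone sketch of that keep-the-newest dedup over a hypothetical jkey array:

  #include <stddef.h>

  struct jkey { unsigned long long pos; unsigned long long seq; };

  /* assumes keys[] is sorted by pos, oldest first within equal positions */
  static size_t dedup_keep_newest(struct jkey *keys, size_t nr)
  {
          size_t dst = 0;

          for (size_t i = 0; i < nr; i++) {
                  /* a following key at the same position supersedes this one */
                  if (i + 1 < nr && keys[i + 1].pos == keys[i].pos)
                          continue;
                  keys[dst++] = keys[i];
          }
          return dst;
  }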
journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); +bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); @@ -58,8 +89,14 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) c->journal_keys.initial_ref_held = false; } -void bch2_journal_entries_free(struct bch_fs *); - int bch2_journal_keys_sort(struct bch_fs *); +void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + unsigned, unsigned, + struct bpos, struct bpos); + +void bch2_journal_keys_dump(struct bch_fs *); + +void bch2_fs_journal_keys_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h new file mode 100644 index 000000000000..8b773823704f --- /dev/null +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H + +struct journal_key_range_overwritten { + size_t start, end; +}; + +struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +}; + +struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; + struct journal_key *data; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). 
+ */ + size_t gap; + atomic_t ref; + bool initial_ref_held; + struct mutex overwrite_lock; +}; + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 74e52fd28abe..edce59433375 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -32,12 +32,22 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; +static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, + struct bkey_cached *ck, + enum btree_node_locked_type lock_held) +{ + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; + mark_btree_node_locked(trans, path, 0, lock_held); +} + __flatten inline struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) @@ -69,224 +79,95 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) return true; } -static void bkey_cached_evict(struct btree_key_cache *c, +static bool bkey_cached_evict(struct btree_key_cache *c, struct bkey_cached *ck) { - BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)); - memset(&ck->key, ~0, sizeof(ck->key)); + bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params); + if (ret) { + memset(&ck->key, ~0, sizeof(ck->key)); + atomic_long_dec(&c->nr_keys); + } - atomic_long_dec(&c->nr_keys); + return ret; } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); + struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); + struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); - if (ck->c.lock.readers) { - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - } else { - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - } - atomic_long_inc(&bc->nr_freed); + this_cpu_dec(*c->btree_key_cache.nr_pending); + kmem_cache_free(bch2_key_cache, ck); +} +static void bkey_cached_free(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ kfree(ck->k); ck->k = NULL; ck->u64s = 0; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); -} - -#ifdef __KERNEL__ -static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bkey_cached *pos; - - bc->nr_freed_nonpcpu++; - - list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { - if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, - pos->btree_trans_barrier_seq)) { - list_move(&ck->list, &pos->list); - return; - } - } - list_move(&ck->list, &bc->freed_nonpcpu); + bool pcpu_readers = ck->c.lock.readers != NULL; + rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); + 
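The journal_keys structure added in btree_journal_iter_types.h above is a gap buffer: the unused slots live at @gap instead of at the end, so a run of insertions near the same position only pays for shifting elements when the gap has to move. A standalone sketch of the move_gap() idea over a plain int array (assumed layout: live elements occupy [0, gap) and [gap + free, size)):

  #include <stddef.h>
  #include <string.h>

  struct gapbuf {
          int     *data;
          size_t  nr;     /* live elements */
          size_t  size;   /* allocated slots */
          size_t  gap;    /* logical index where the free slots sit */
  };

  static void move_gap(struct gapbuf *b, size_t new_gap)
  {
          size_t free = b->size - b->nr;

          if (new_gap < b->gap)
                  /* elements in [new_gap, gap) slide up past the free space */
                  memmove(b->data + new_gap + free, b->data + new_gap,
                          (b->gap - new_gap) * sizeof(b->data[0]));
          else if (new_gap > b->gap)
                  /* elements in [gap + free, new_gap + free) slide down */
                  memmove(b->data + b->gap, b->data + b->gap + free,
                          (new_gap - b->gap) * sizeof(b->data[0]));

          b->gap = new_gap;
  }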
this_cpu_inc(*bc->nr_pending); } -#endif -static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, - struct bkey_cached *ck) +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - if (!ck->c.lock.readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - bool freed = false; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - __bkey_cached_move_to_freelist_ordered(bc, ck2); - } - preempt_enable(); + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - __bkey_cached_move_to_freelist_ordered(bc, ck); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_pcpu); - mutex_unlock(&bc->lock); + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); + if (unlikely(!ck)) + return NULL; + ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + if (unlikely(!ck->k)) { + kmem_cache_free(bch2_key_cache, ck); + return NULL; } -} - -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - list_del_init(&ck->list); - atomic_long_inc(&bc->nr_freed); - - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - bkey_cached_move_to_freelist(bc, ck); - - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); + ck->u64s = key_u64s; + return ck; } static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, - bool *was_new) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); int ret; - if (!pcpu_readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (!list_empty(&bc->freed_nonpcpu) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - f->objs[f->nr++] = ck; - } - - ck = f->nr ? 
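bkey_cached_free() above no longer manages per-cpu free lists by hand; it enqueues the object on an rcu_pending list keyed by the btree_trans_barrier SRCU, so the memory is only handed back out or returned to the slab once the grace period it was freed under has expired (lockless btree_trans readers may still hold a reference until then). The removed code expressed the same idea explicitly with start_poll/poll_state_synchronize_srcu; a hedged sketch of that general pattern, with a hypothetical struct obj and reaper (not the rcu_pending implementation):

  #include <linux/list.h>
  #include <linux/slab.h>
  #include <linux/srcu.h>

  struct obj {
          struct list_head        list;
          unsigned long           barrier_seq;
  };

  /* on free: record which SRCU grace period this object must wait out */
  static void defer_free(struct srcu_struct *srcu, struct obj *o,
                         struct list_head *pending)
  {
          o->barrier_seq = start_poll_synchronize_srcu(srcu);
          list_add_tail(&o->list, pending);
  }

  /* later (shrinker/reaper): free entries whose grace period has elapsed */
  static void reap(struct srcu_struct *srcu, struct list_head *pending)
  {
          struct obj *o, *n;

          list_for_each_entry_safe(o, n, pending, list) {
                  if (!poll_state_synchronize_srcu(srcu, o->barrier_seq))
                          break;  /* enqueue order: the rest are newer */
                  list_del(&o->list);
                  kfree(o);
          }
  }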
f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_nonpcpu)) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - } - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_pcpu)) { - ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); - list_del_init(&ck->list); - } - mutex_unlock(&bc->lock); - } - - if (ck) { - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); - if (unlikely(ret)) { - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - path->l[0].b = (void *) ck; - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); - - ret = bch2_btree_node_lock_write(trans, path, &ck->c); - if (unlikely(ret)) { - btree_node_unlock(trans, path, 0); - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - return ck; - } + struct bkey_cached *ck = container_of_or_null( + rcu_pending_dequeue(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; ck = allocate_dropping_locks(trans, ret, - kmem_cache_zalloc(bch2_key_cache, _gfp)); + __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { + if (ck) + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); } - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); - - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; + if (ck) { + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); + ck->c.cached = true; + goto lock; + } + + ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; +lock: + six_lock_intent(&ck->c.lock, NULL, NULL); + six_lock_write(&ck->c.lock, NULL, NULL); return ck; } @@ -298,310 +179,215 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; - mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); - goto out; + if (bkey_cached_evict(c, ck)) + goto out; + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } } ck = NULL; out: rcu_read_unlock(); - mutex_unlock(&c->lock); return ck; } -static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck; - bool was_new = false; - ck = bkey_cached_alloc(trans, path, &was_new); - if (IS_ERR(ck)) - return ck; + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + unsigned key_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + + struct bkey_cached *ck = 
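The allocation above sizes the cached key with headroom so that later commits through the key cache rarely need a reallocation (which forces a transaction restart): one extra u64 because varint decoding may read past the end, then 1.5x growth capped at 256 u64s and rounded up to a power of two. A standalone, userspace illustration of that sizing rule, using the same constants as the diff:

  #include <stdio.h>

  static unsigned roundup_pow_of_two(unsigned n)
  {
          unsigned r = 1;

          while (r < n)
                  r <<= 1;
          return r;
  }

  static unsigned cached_key_u64s(unsigned key_u64s)
  {
          key_u64s += 1;                  /* varint decode may overread */
          key_u64s = key_u64s * 3 / 2;    /* headroom for in-place updates */
          if (key_u64s > 256)
                  key_u64s = 256;
          return roundup_pow_of_two(key_u64s);
  }

  int main(void)
  {
          for (unsigned u64s = 1; u64s <= 256; u64s *= 4)
                  printf("key %3u u64s -> allocate %3u u64s\n",
                         u64s, cached_key_u64s(u64s));
          return 0;
  }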
bkey_cached_alloc(trans, ck_path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + bch2_btree_id_str(ck_path->btree_id)); + return -BCH_ERR_ENOMEM_btree_key_cache_create; } - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; - ck->valid = false; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&bc->table, - &ck->hash, - bch2_btree_key_cache_params))) { - /* We raced with another fill: */ - - if (likely(was_new)) { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - kfree(ck); - } else { - bkey_cached_free_fast(bc, ck); + if (unlikely(key_u64s > ck->u64s)) { + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); + if (unlikely(!new_k)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + } else if (ret) { + kfree(new_k); + goto err; } - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - return NULL; + kfree(ck->k); + ck->k = new_k; + ck->u64s = key_u64s; } - atomic_long_inc(&bc->nr_keys); + bkey_reassemble(ck->k, k); + + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + + if (unlikely(ret)) /* raced with another fill? 
*/ + goto err; + + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - return ck; + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; +err: + bkey_cached_free(bc, ck); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + return ret; } -static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_cached *ck) +static noinline int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + unsigned flags) { + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; + return 0; + } + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned new_u64s = 0; - struct bkey_i *new_k = NULL; int ret; - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - new_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - new_u64s = min(256U, (new_u64s * 3) / 2); - - if (new_u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (!new_k) { - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); - if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), new_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - goto err; - } - - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } - } - } + /* Recheck after btree lookup, before allocating: */ + ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
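The deleted fill code above open-coded a pattern the new code gets from the allocate_dropping_locks() helper: try an atomic GFP_NOWAIT allocation while btree locks are still held, and only if that fails drop the transaction's locks, allocate with GFP_KERNEL, and relock (accepting that the relock may return a transaction restart). A hedged sketch of the shape of that pattern - a hypothetical wrapper, not the actual macro:

  static void *alloc_dropping_locks(struct btree_trans *trans, size_t size, int *ret)
  {
          void *p = kmalloc(size, GFP_NOWAIT|__GFP_NOWARN);

          *ret = 0;
          if (p)
                  return p;

          /* can't block with btree locks held: unlock, allocate, relock */
          bch2_trans_unlock(trans);
          p = kmalloc(size, GFP_KERNEL);

          *ret = p ? bch2_trans_relock(trans) : -ENOMEM;
          if (*ret) {
                  kfree(p);
                  p = NULL;
          }
          return p;
  }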
-EEXIST : 0; + if (unlikely(ret)) + goto out; - ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); - if (ret) { - kfree(new_k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); + if (ret) goto err; - } - if (new_k) { - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - } - - bkey_reassemble(ck->k, k); - ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + if (trace_key_cache_fill_enabled()) { + struct printbuf buf = PRINTBUF; + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); + } +out: /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: bch2_trans_iter_exit(trans, &iter); return ret; } -static noinline int -bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, + struct btree_path *path) { struct bch_fs *c = trans->c; struct bkey_cached *ck; - int ret = 0; - - BUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - ck = btree_key_cache_create(trans, path); - ret = PTR_ERR_OR_ZERO(ck); - if (ret) - goto err; - if (!ck) - goto retry; - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); - path->locks_want = 1; - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - BUG_ON(ret); - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); - } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - path->uptodate = BTREE_ITER_UPTODATE; - - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { - /* - * Using the underscore version because we haven't set - * path->uptodate yet: - */ - if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { - trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); - goto err; - } + if (!ck) + return -ENOENT; - ret = btree_key_cache_fill(trans, path, ck); - if (ret) - goto err; + enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = bch2_btree_path_relock(trans, path, _THIS_IP_); - if (ret) - goto err; + int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); + if (ret) + return ret; - path->uptodate = BTREE_ITER_UPTODATE; + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - BUG_ON(path->uptodate); - - return ret; -err: - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = 
ERR_PTR(ret); - } - return ret; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - int ret = 0; - EBUG_ON(path->level); path->l[1].b = NULL; - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (ret) - return ret; - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; + int ret; + do { + ret = btree_path_traverse_cached_fast(trans, path); + if (unlikely(ret == -ENOENT)) + ret = btree_key_cache_fill(trans, path, flags); + } while (ret == -EEXIST); + + if (unlikely(ret)) { + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - if (!ck->valid) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - path->uptodate = BTREE_ITER_UPTODATE; - EBUG_ON(!ck->valid); - EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - return ret; } @@ -618,13 +404,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + BTREE_ITER_cached| + BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; ret = bch2_btree_iter_traverse(&c_iter); if (ret) @@ -640,8 +426,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; } - BUG_ON(!ck->valid); - if (journal_seq && ck->journal.seq != journal_seq) goto out; @@ -657,24 +441,31 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - j->watermark == BCH_WATERMARK_stripe) + !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - ret = bch2_btree_iter_traverse(&b_iter) ?: - bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_KEY_CACHE_RECLAIM| - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - BTREE_TRIGGER_NORUN) ?: + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + ret = bkey_err(btree_k); + if (ret) + goto err; + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + ret = bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + 
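bch2_btree_path_traverse_cached() above is reduced to a small retry loop: the fast path reports -ENOENT when the key isn't cached yet, the fill path reports -EEXIST when it lost the race to another thread creating the entry, and only those two private errors are handled locally - anything else, including transaction restarts, is propagated. A sketch of that control flow with hypothetical helper names:

  /* illustrative structure only, not the real helpers */
  static int traverse_cached(struct btree_trans *trans, struct btree_path *path,
                             unsigned flags)
  {
          int ret;

          do {
                  ret = traverse_cached_fast(trans, path);      /* -ENOENT: not cached */
                  if (ret == -ENOENT)
                          ret = cache_fill(trans, path, flags); /* -EEXIST: lost race */
          } while (ret == -EEXIST);

          return ret;     /* 0, a transaction restart, or a real error */
  }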
BTREE_TRIGGER_norun) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); - +err: bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && !bch2_journal_error(j), c, - "error flushing key cache: %s", bch2_err_str(ret)); + "flushing key cache: %s", bch2_err_str(ret)); if (ret) goto out; @@ -704,8 +495,12 @@ evict: } mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); + if (bkey_cached_evict(&c->btree_key_cache, ck)) { + bkey_cached_free(&c->btree_key_cache, ck); + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } } out: bch2_trans_iter_exit(trans, &b_iter); @@ -763,7 +558,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); bkey_copy(ck->k, insert); - ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -786,7 +580,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, * flushing. The flush callback will not proceed unless ->seq matches * the latest pin, so make sure it starts with a consistent value. */ - if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || !journal_pin_active(&ck->journal)) { ck->seq = trans->journal_res.seq; } @@ -802,10 +596,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; - BUG_ON(!ck->valid); - /* * We just did an update to the btree, bypassing the key cache: the key * cache key is now stale and must be dropped, even if dirty: @@ -816,7 +609,22 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bch2_journal_pin_drop(&c->journal, &ck->journal); } - ck->valid = false; + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + + struct btree_path *path2; + unsigned i; + trans_for_each_path(trans, path2, i) + if (path2->l[0].b == (void *) ck) { + __bch2_btree_path_unlock(trans, path2); + path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); + path2->should_be_locked = false; + btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + } + + bch2_trans_verify_locks(trans); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, @@ -825,97 +633,75 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = shrink->private_data; struct btree_key_cache *bc = &c->btree_key_cache; struct bucket_table *tbl; - struct bkey_cached *ck, *t; + struct bkey_cached *ck; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned start, flags; + unsigned iter, start; int srcu_idx; - mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - flags = memalloc_nofs_save(); + rcu_read_lock(); + + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); /* - * Newest freed entries are at the end of the list - once we hit one - * that's too new to be freed, we can bail out: + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. 
*/ - scanned += bc->nr_freed_nonpcpu; - - list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - freed++; - bc->nr_freed_nonpcpu--; - } - - if (scanned >= nr) - goto out; - - scanned += bc->nr_freed_pcpu; - - list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - freed++; - bc->nr_freed_pcpu--; + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; } - if (scanned >= nr) - goto out; - - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - start = bc->shrink_iter; + iter = bc->shrink_iter; + if (iter >= tbl->size) + iter = 0; + start = iter; do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[iter]); while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + next = rht_dereference_bucket_rcu(pos->next, tbl, iter); ck = container_of(pos, struct bkey_cached, hash); - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - goto next; - - if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + bc->skipped_dirty++; + } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - else if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(bc, ck); + bc->skipped_accessed++; + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else if (bkey_cached_evict(bc, ck)) { bkey_cached_free(bc, ck); + bc->freed++; + freed++; + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } scanned++; if (scanned >= nr) - break; -next: + goto out; + pos = next; } - bc->shrink_iter++; - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - } while (scanned < nr && bc->shrink_iter != start); + iter++; + if (iter >= tbl->size) + iter = 0; + } while (scanned < nr && iter != start); +out: + bc->shrink_iter = iter; rcu_read_unlock(); -out: - memalloc_nofs_restore(flags); srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - mutex_unlock(&bc->lock); return freed; } @@ -928,6 +714,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } @@ -935,60 +729,36 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bucket_table *tbl; - struct bkey_cached *ck, *n; + struct bkey_cached *ck; struct rhash_head *pos; LIST_HEAD(items); unsigned i; -#ifdef __KERNEL__ - int cpu; -#endif shrinker_free(bc->shrink); - mutex_lock(&bc->lock); - /* * The loop is needed to guard against racing with rehash: */ while 
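The rewritten shrinker scan above walks the hash table with a persistent cursor (shrink_iter), so consecutive shrinker invocations resume where the previous one stopped and wrap around at most once per call, and it now counts why each element was skipped (dirty, recently accessed, lock contention). A standalone sketch of that clock-hand traversal, with a flat array standing in for the rhashtable:

  #include <stdbool.h>
  #include <stddef.h>

  struct cache {
          bool    *evictable;     /* "clean, unlocked, not recently used" */
          size_t  nr;
          size_t  clock_hand;     /* persists across scan calls */
  };

  static size_t shrink_scan(struct cache *c, size_t nr_to_scan)
  {
          size_t start = c->clock_hand < c->nr ? c->clock_hand : 0;
          size_t iter = start, scanned = 0, freed = 0;

          do {
                  if (c->evictable[iter]) {
                          c->evictable[iter] = false;     /* "evict" */
                          freed++;
                  }
                  scanned++;
                  iter = iter + 1 < c->nr ? iter + 1 : 0;
          } while (scanned < nr_to_scan && iter != start);

          c->clock_hand = iter;   /* resume here next time */
          return freed;
  }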
(atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &items); + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); + BUG_ON(!bkey_cached_evict(bc, ck)); + kfree(ck->k); + kmem_cache_free(bch2_key_cache, ck); } - rcu_read_unlock(); - } - -#ifdef __KERNEL__ - for_each_possible_cpu(cpu) { - struct btree_key_cache_freelist *f = - per_cpu_ptr(bc->pcpu_freed, cpu); - - for (i = 0; i < f->nr; i++) { - ck = f->objs[i]; - list_add(&ck->list, &items); } - } -#endif - - BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); - BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); - - list_splice(&bc->freed_pcpu, &items); - list_splice(&bc->freed_nonpcpu, &items); - - mutex_unlock(&bc->lock); - - list_for_each_entry_safe(ck, n, &items, list) { - cond_resched(); - - list_del(&ck->list); - kfree(ck->k); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); + rcu_read_unlock(); } if (atomic_long_read(&bc->nr_dirty) && @@ -1004,14 +774,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); - free_percpu(bc->pcpu_freed); + rcu_pending_exit(&bc->pending[0]); + rcu_pending_exit(&bc->pending[1]); + + free_percpu(bc->nr_pending); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { - mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed_pcpu); - INIT_LIST_HEAD(&c->freed_nonpcpu); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) @@ -1019,11 +789,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct shrinker *shrink; -#ifdef __KERNEL__ - bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); - if (!bc->pcpu_freed) + bc->nr_pending = alloc_percpu(size_t); + if (!bc->nr_pending) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || + rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; -#endif if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; @@ -1034,22 +806,32 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; } -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); - prt_newline(out); - prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 12); + + prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); + prt_printf(out, "dirty:\t%lu\r\n", 
atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); prt_newline(out); - prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "shrinker:\n"); + prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); + prt_printf(out, "freed:\t%lu\r\n", bc->freed); + prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); + prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); + prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); prt_newline(out); + prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e6b2cd0dd2c1..51d6289b8dee 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -11,13 +11,27 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) return max_t(ssize_t, 0, nr_dirty - max_dirty); } -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty - max_dirty; +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + return __bch2_btree_key_cache_must_wait(c) > 0; +} + +static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 2048 + (nr_keys * 5) / 8; + + return nr_dirty <= max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 290e4e57df5b..722f1ed10551 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -2,28 +2,28 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; +#include "rcu_pending.h" struct btree_key_cache { - struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed_pcpu; - size_t nr_freed_pcpu; - struct list_head freed_nonpcpu; - size_t nr_freed_nonpcpu; - struct shrinker *shrink; unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - atomic_long_t nr_freed; + /* 0: non pcpu reader locks, 1: pcpu reader locks */ + struct rcu_pending pending[2]; + size_t __percpu *nr_pending; + atomic_long_t nr_keys; atomic_long_t nr_dirty; + + /* shrinker stats */ + unsigned long requested_to_free; + unsigned long freed; + unsigned long skipped_dirty; + unsigned long skipped_accessed; + unsigned long skipped_lock_fail; }; struct bkey_cached_key { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 684397442338..caef65adeae4 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -7,22 +7,13 @@ static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags) + enum six_lock_init_flags flags, + gfp_t gfp) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); - lockdep_set_novalidate_class(&b->lock); + __six_lock_init(&b->lock, 
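The btree_key_cache.h change above turns the dirty-key throttle into a hysteresis so waiters don't thrash around a single threshold: a transaction must wait once nr_dirty exceeds 4096 + 3/4 of nr_keys, and is only considered done waiting once nr_dirty has drained to at most 2048 + 5/8 of nr_keys. A standalone restatement of those two watermarks:

  #include <stdbool.h>
  #include <stddef.h>

  /* high watermark: start throttling producers of dirty cached keys */
  static bool must_wait(size_t nr_dirty, size_t nr_keys)
  {
          return nr_dirty > 4096 + nr_keys * 3 / 4;
  }

  /* low watermark: only release waiters once flushing has caught up */
  static bool wait_done(size_t nr_dirty, size_t nr_keys)
  {
          return nr_dirty <= 2048 + nr_keys * 5 / 8;
  }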
"b->c.lock", &bch2_btree_node_lock_key, flags, gfp); + lockdep_set_notrack_class(&b->lock); } -#ifdef CONFIG_LOCKDEP -void bch2_assert_btree_nodes_not_locked(void) -{ -#if 0 - //Re-enable when lock_class_is_held() is merged: - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); -#endif -} -#endif - /* Btree node locking: */ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, @@ -83,8 +74,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; - prt_printf(out, "Found lock cycle (%u entries):", g->nr); - prt_newline(out); + prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); for (i = g->g; i < g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); @@ -120,6 +110,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) lock_graph_up(g); } +static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + while (g->g + g->nr > i) + lock_graph_up(g); +} + static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { g->g[g->nr++] = (struct trans_waiting_for_lock) { @@ -135,15 +131,20 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) __lock_graph_down(g, trans); } -static bool lock_graph_remove_non_waiters(struct lock_graph *g) +static bool lock_graph_remove_non_waiters(struct lock_graph *g, + struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i; - for (i = g->g + 1; i < g->g + g->nr; i++) + if (from->trans->locking != from->node_want) { + lock_graph_pop_from(g, from); + return true; + } + + for (i = from + 1; i < g->g + g->nr; i++) if (i->trans->locking != i->node_want || i->trans->locking_wait.start_time != i[-1].lock_start_time) { - while (g->g + g->nr > i) - lock_graph_up(g); + lock_graph_pop_from(g, i); return true; } @@ -190,13 +191,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } -static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, + struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i, *abort = NULL; unsigned best = 0, pref; int ret; - if (lock_graph_remove_non_waiters(g)) + if (lock_graph_remove_non_waiters(g, from)) return 0; /* Only checking, for debugfs: */ @@ -206,7 +208,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) goto out; } - for (i = g->g; i < g->g + g->nr; i++) { + for (i = from; i < g->g + g->nr; i++) { pref = btree_trans_abort_preference(i->trans); if (pref > best) { abort = i; @@ -216,6 +218,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) if (unlikely(!best)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); @@ -224,15 +227,14 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) bch2_btree_trans_to_text(&buf, trans); - prt_printf(&buf, "backtrace:"); - prt_newline(&buf); + prt_printf(&buf, "backtrace:\n"); printbuf_indent_add(&buf, 2); bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } @@ -240,8 +242,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ret = abort_lock(g, 
abort); out: if (ret) - while (g->nr) - lock_graph_up(g); + lock_graph_pop_all(g); + else + lock_graph_pop_from(g, abort); return ret; } @@ -254,7 +257,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, for (i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); - return break_cycle(g, cycle); + return break_cycle(g, cycle, i); } if (g->nr == ARRAY_SIZE(g->g)) { @@ -263,8 +266,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, if (orig_trans->lock_may_not_fail) return 0; - while (g->nr) - lock_graph_up(g); + lock_graph_pop_all(g); if (cycle) return 0; @@ -292,7 +294,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) g.nr = 0; - if (trans->lock_must_abort) { + if (trans->lock_must_abort && !trans->lock_may_not_fail) { if (cycle) return -1; @@ -347,7 +349,7 @@ next: * structures - which means it can't be blocked * waiting on a lock: */ - if (!lock_graph_remove_non_waiters(&g)) { + if (!lock_graph_remove_non_waiters(&g, g.g)) { /* * If lock_graph_remove_non_waiters() * didn't do anything, it must be @@ -440,33 +442,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - struct btree_path *linked; - unsigned i, iter; - int ret; - - /* - * XXX BIG FAT NOTICE - * - * Drop all read locks before taking a write lock: - * - * This is a hack, because bch2_btree_node_lock_write_nofail() is a - * hack - but by dropping read locks first, this should never fail, and - * we only use this in code paths where whatever read locks we've - * already taken are no longer needed: - */ - - trans_for_each_path(trans, linked, iter) { - if (!linked->nodes_locked) - continue; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_read_locked(linked, i)) { - btree_node_unlock(trans, linked, i); - btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); - } - } - - ret = __btree_node_lock_write(trans, path, b, true); + int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } @@ -518,8 +494,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (path->uptodate == BTREE_ITER_NEED_RELOCK) path->uptodate = BTREE_ITER_UPTODATE; - bch2_trans_verify_locks(trans); - return path->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -551,7 +525,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; - struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); if (!is_btree_node(path, level)) return false; @@ -575,24 +548,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, if (race_fault()) return false; - if (btree_node_locked(path, level)) { - bool ret; - - six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); - ret = six_lock_tryupgrade(&b->c.lock); - six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); - - if (ret) - goto success; - } else { - if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - } + if (btree_node_locked(path, level) + ? six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; - /* - * Do we already have an intent lock via another path? 
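break_cycle() above now only considers the part of the lock graph that actually forms the cycle (starting from the entry that closed it) and restarts the transaction with the highest abort preference; a preference of zero marks a transaction that must not be aborted, and a cycle consisting entirely of such transactions is reported as a fatal "cycle of nofail locks". A small sketch of that victim selection over a hypothetical array of cycle entries:

  #include <stddef.h>

  struct waiter {
          int abort_pref;  /* 0: may not be aborted; higher: cheaper to abort */
  };

  /*
   * Pick the cheapest transaction in the cycle to restart; returns -1 when
   * every entry is nofail (the kernel code treats that case as a bug).
   */
  static int pick_victim(const struct waiter *cycle, size_t nr)
  {
          int best = 0, victim = -1;

          for (size_t i = 0; i < nr; i++)
                  if (cycle[i].abort_pref > best) {
                          best = cycle[i].abort_pref;
                          victim = (int) i;
                  }

          return victim;
  }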
If so, just bump - * lock count: - */ if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); @@ -635,7 +595,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa { struct get_locks_fail f; - return btree_path_get_locks(trans, path, false, &f); + bool ret = btree_path_get_locks(trans, path, false, &f); + bch2_trans_verify_locks(trans); + return ret; } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -658,7 +620,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true, f); + bool ret = btree_path_get_locks(trans, path, true, f); + bch2_trans_verify_locks(trans); + return ret; } bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -666,8 +630,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) - return true; + bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); + if (ret) + goto out; /* * XXX: this is ugly - we'd prefer to not be mucking with other @@ -701,8 +666,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, btree_path_get_locks(trans, linked, true, NULL); } } - - return false; +out: + bch2_trans_verify_locks(trans); + return ret; } void __bch2_btree_path_downgrade(struct btree_trans *trans, @@ -747,85 +713,102 @@ void bch2_trans_downgrade(struct btree_trans *trans) return; trans_for_each_path(trans, path, i) - bch2_btree_path_downgrade(trans, path); + if (path->ref) + bch2_btree_path_downgrade(trans, path); } -int bch2_trans_relock(struct btree_trans *trans) +static inline void __bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; unsigned i; - if (unlikely(trans->restarted)) - return -((int) trans->restarted); + trans_for_each_path(trans, path, i) + __bch2_btree_path_unlock(trans, path); +} - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; +static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, + struct get_locks_fail *f, bool trace) +{ + if (!trace) + goto out; - if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) { - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", - f.l, path->l[f.l].lock_seq); - if (IS_ERR_OR_NULL(f.b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); - } else { - prt_printf(&buf, "%u", f.b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f.b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + if (IS_ERR_OR_NULL(f->b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); + } else { + prt_printf(&buf, "%u", f->b->c.lock.seq); - count_event(trans->c, trans_restart_relock); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + struct 
six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f->b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); } - return 0; + count_event(trans->c, trans_restart_relock); +out: + __bch2_trans_unlock(trans); + bch2_trans_verify_locks(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } -int bch2_trans_relock_notrace(struct btree_trans *trans) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) { - struct btree_path *path; - unsigned i; + bch2_trans_verify_locks(trans); if (unlikely(trans->restarted)) return -((int) trans->restarted); + if (unlikely(trans->locked)) + goto out; + + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; - trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path)) { - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - } + !btree_path_get_locks(trans, path, false, &f)) + return bch2_trans_relock_fail(trans, path, &f, trace); + } + + trans_set_locked(trans, true); +out: + bch2_trans_verify_locks(trans); return 0; } +int bch2_trans_relock(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, true); +} + +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, false); +} + void bch2_trans_unlock_noassert(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans_set_unlocked(trans); } void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) @@ -834,15 +817,15 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } -bool bch2_trans_locked(struct btree_trans *trans) +void bch2_trans_unlock_write(struct btree_trans *trans) { struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_write_locked(path, l)) + bch2_btree_node_unlock_write(trans, path, path->l[l].b); } int __bch2_trans_mutex_lock(struct btree_trans *trans, @@ -861,15 +844,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, void bch2_btree_path_verify_locks(struct btree_path *path) { - unsigned l; + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level) && + !path->nodes_locked); - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); + if (!path->nodes_locked) return; - } - for (l = 0; l < BTREE_MAX_DEPTH; l++) { + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); int have = btree_node_locked_type(path, l); @@ -879,11 +866,30 @@ void bch2_btree_path_verify_locks(struct btree_path 
*path) (want == BTREE_NODE_UNLOCKED || have != BTREE_NODE_WRITE_LOCKED) && want != have); + + BUG_ON(btree_node_locked(path, l) && + path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); } } +static bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + if (path->nodes_locked) + return true; + return false; +} + void bch2_trans_verify_locks(struct btree_trans *trans) { + if (!trans->locked) { + BUG_ON(bch2_trans_locked(trans)); + return; + } + struct btree_path *path; unsigned i; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4bd72c855da1..b33ab7af8440 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,15 +13,10 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); - -#ifdef CONFIG_LOCKDEP -void bch2_assert_btree_nodes_not_locked(void); -#else -static inline void bch2_assert_btree_nodes_not_locked(void) {} -#endif +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); void bch2_trans_unlock_noassert(struct btree_trans *); +void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) { @@ -81,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, path->nodes_locked |= (type + 1) << (level << 1); } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) -{ - EBUG_ON(btree_node_write_locked(path, level)); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); -} - static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, unsigned level, @@ -130,6 +118,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, /* unlock: */ +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -138,10 +129,14 @@ static inline void btree_node_unlock(struct btree_trans *trans, EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) { + if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { + bch2_btree_node_unlock_write(trans, path, path->l[level].b); + lock_type = BTREE_NODE_INTENT_LOCKED; + } six_unlock_type(&path->l[level].b->c.lock, lock_type); btree_trans_lock_hold_time_update(trans, path, level); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } - mark_btree_node_unlocked(path, level); } static inline int btree_path_lowest_level_locked(struct btree_path *path) @@ -168,47 +163,76 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, * succeed: */ static inline void +__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) +{ + if (!b->c.lock.write_lock_recurse) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path_with_node(trans, b, linked, i) + linked->l[b->c.level].lock_seq++; + } + + six_unlock_write(&b->c.lock); +} + +static inline void bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - struct btree_path *linked; - unsigned i; - EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, 
BTREE_NODE_INTENT_LOCKED); - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - - six_unlock_write(&b->c.lock); + __bch2_btree_node_unlock_write(trans, b); } -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ +static inline void trans_set_locked(struct btree_trans *trans, bool try) +{ + if (!trans->locked) { + lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); + trans->locked = true; + trans->last_unlock_ip = 0; + + trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; + current->flags |= PF_MEMALLOC_NOFS; + } +} + +static inline void trans_set_unlocked(struct btree_trans *trans) +{ + if (trans->locked) { + lock_release(&trans->dep_map, _THIS_IP_); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; + + if (!trans->pf_memalloc_nofs) + current->flags &= ~PF_MEMALLOC_NOFS; + } +} + static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, bool lock_may_not_fail, unsigned long ip) { - int ret; - trans->lock_may_not_fail = lock_may_not_fail; trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); + int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); + + if (!ret) + trace_btree_path_lock(trans, _THIS_IP_, b); return ret; } @@ -262,6 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || @@ -364,14 +389,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f; + struct get_locks_fail f = {}; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->uptodate == BTREE_ITER_UPTODATE) + : path->nodes_locked) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, @@ -381,12 +406,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, /* misc: */ -static inline void btree_path_set_should_be_locked(struct btree_path *path) +static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); } static inline void __btree_path_set_level_up(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c new file mode 100644 index 000000000000..a7f06deee13c --- /dev/null +++ b/fs/bcachefs/btree_node_scan.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_journal_iter.h" +#include "btree_node_scan.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "recovery_passes.h" + +#include <linux/kthread.h> +#include <linux/min_heap.h> +#include <linux/sort.h> + +struct find_btree_nodes_worker { + struct closure *cl; + struct find_btree_nodes *f; + struct bch_dev *ca; +}; + +static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) +{ + bch2_btree_id_level_to_text(out, n->btree_id, n->level); + prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", + n->seq, n->journal_seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, n->ptrs + i); + } +} + +static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) +{ + printbuf_indent_add(out, 2); + darray_for_each(nodes, i) { + found_btree_node_to_text(out, c, i); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) +{ + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); + + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); + bp->k.p = f->max_key; + bp->v.seq = cpu_to_le64(f->cookie); + bp->v.sectors_written = 0; + bp->v.flags = 0; + bp->v.sectors_written = cpu_to_le16(f->sectors_written); + bp->v.min_key = f->min_key; + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); + memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); +} + +static inline u64 bkey_journal_seq(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); + default: + return 0; + } +} + +static bool found_btree_node_is_readable(struct btree_trans *trans, + struct found_btree_node *f) +{ + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, f); + + struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); + bool ret = !IS_ERR_OR_NULL(b); + if (!ret) + return ret; + + f->sectors_written = b->written; + f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); + + struct bkey_s_c k; + struct bkey unpacked; + struct 
btree_node_iter iter; + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) + f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); + + six_unlock_read(&b->c.lock); + + /* + * We might update this node's range; if that happens, we need the node + * to be re-read so the read path can trim keys that are no longer in + * this node + */ + if (b != btree_node_root(trans->c, b)) + bch2_btree_node_evict(trans, &tmp.k); + return ret; +} + +static int found_btree_node_cmp_cookie(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + cmp_int(l->cookie, r->cookie); +} + +/* + * Given two found btree nodes, if their sequence numbers are equal, take the + * one that's readable: + */ +static int found_btree_node_cmp_time(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return cmp_int(l->seq, r->seq) ?: + cmp_int(l->journal_seq, r->journal_seq); +} + +static int found_btree_node_cmp_pos(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->min_key, r->min_key) ?: + -found_btree_node_cmp_time(l, r); +} + +static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) +{ + return found_btree_node_cmp_pos(l, r) < 0; +} + +static inline void found_btree_node_swap(void *_l, void *_r, void *arg) +{ + struct found_btree_node *l = _l; + struct found_btree_node *r = _r; + + swap(*l, *r); +} + +static const struct min_heap_callbacks found_btree_node_heap_cbs = { + .less = found_btree_node_cmp_pos_less, + .swp = found_btree_node_swap, +}; + +static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, bn, PAGE_SIZE); + + submit_bio_wait(bio); + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status))) + return; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return; + + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + if (!c->chacha20) + return; + + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); + } + + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) + return; + + if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) + return; + + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) + return; + + rcu_read_lock(); + struct found_btree_node n = { + .btree_id = BTREE_NODE_ID(bn), + .level = BTREE_NODE_LEVEL(bn), + .seq = BTREE_NODE_SEQ(bn), + .cookie = le64_to_cpu(bn->keys.seq), + .min_key = bn->min_key, + .max_key = bn->max_key, + .nr_ptrs = 1, + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, + .ptrs[0].offset = offset, + .ptrs[0].dev = ca->dev_idx, + .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), + }; + rcu_read_unlock(); + + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + mutex_lock(&f->lock); + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { + bch_err(c, "try_read_btree_node() can't handle endian conversion"); + f->ret = -EINVAL; + 
goto unlock; + } + + if (darray_push(&f->nodes, n)) + f->ret = -ENOMEM; +unlock: + mutex_unlock(&f->lock); + } +} + +static int read_btree_nodes_worker(void *p) +{ + struct find_btree_nodes_worker *w = p; + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); + struct bch_dev *ca = w->ca; + void *buf = (void *) __get_free_page(GFP_KERNEL); + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); + unsigned long last_print = jiffies; + + if (!buf || !bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + w->f->ret = -ENOMEM; + goto err; + } + + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + for (unsigned bucket_offset = 0; + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) { + if (time_after(jiffies, last_print + HZ * 30)) { + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(cur_sector * 100, end_sector)); + last_print = jiffies; + } + + u64 sector = bucket * ca->mi.bucket_size + bucket_offset; + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) + continue; + + try_read_btree_node(w->f, ca, bio, buf, sector); + } +err: + bio_put(bio); + free_page((unsigned long) buf); + percpu_ref_get(&ca->io_ref); + closure_put(w->cl); + kfree(w); + return 0; +} + +static int read_btree_nodes(struct find_btree_nodes *f) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + for_each_online_member(c, ca) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) + continue; + + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); + struct task_struct *t; + + if (!w) { + percpu_ref_put(&ca->io_ref); + ret = -ENOMEM; + goto err; + } + + percpu_ref_get(&ca->io_ref); + closure_get(&cl); + w->cl = &cl; + w->f = f; + w->ca = ca; + + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + ret = PTR_ERR_OR_ZERO(t); + if (ret) { + percpu_ref_put(&ca->io_ref); + closure_put(&cl); + f->ret = ret; + bch_err(c, "error starting kthread: %i", ret); + break; + } + } +err: + closure_sync(&cl); + return f->ret ?: ret; +} + +static bool nodes_overlap(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return (l->btree_id == r->btree_id && + l->level == r->level && + bpos_gt(l->max_key, r->min_key)); +} + +static int handle_overwrites(struct bch_fs *c, + struct found_btree_node *l, + found_btree_nodes *nodes_heap) +{ + struct found_btree_node *r; + + while ((r = min_heap_peek(nodes_heap)) && + nodes_overlap(l, r)) { + int cmp = found_btree_node_cmp_time(l, r); + + if (cmp > 0) { + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + else { + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); + } + } else if (cmp < 0) { + BUG_ON(bpos_eq(l->min_key, r->min_key)); + + l->max_key = bpos_predecessor(r->min_key); + l->range_updated = true; + } else if (r->level) { + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + } else { + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + else { + r->range_updated = 
true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); + } + } + } + + return 0; +} + +int bch2_scan_for_btree_nodes(struct bch_fs *c) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; + found_btree_nodes nodes_heap = {}; + size_t dst; + int ret = 0; + + if (f->nodes.nr) + return 0; + + mutex_init(&f->lock); + + ret = read_btree_nodes(f); + if (ret) + return ret; + + if (!f->nodes.nr) { + bch_err(c, "%s: no btree nodes found", __func__); + ret = -EINVAL; + goto err; + } + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + + dst = 0; + darray_for_each(f->nodes, i) { + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; + + if (prev && + prev->cookie == i->cookie) { + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { + bch_err(c, "%s: found too many replicas for btree node", __func__); + ret = -EINVAL; + goto err; + } + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; + } else { + f->nodes.data[dst++] = *i; + } + } + f->nodes.nr = dst; + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + swap(nodes_heap, f->nodes); + + { + /* darray must have same layout as a heap */ + min_heap_char real_heap; + BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); + BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); + BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); + BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); + } + + min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); + + if (nodes_heap.nr) { + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + while (true) { + ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); + if (ret) + goto err; + + if (!nodes_heap.nr) + break; + + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) + BUG_ON(nodes_overlap(n, n + 1)); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } else { + bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +err: + darray_exit(&nodes_heap); + printbuf_exit(&buf); + return ret; +} + +static int found_btree_node_range_start_cmp(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->max_key, r->min_key); +} + +#define 
for_each_found_btree_node_in_range(_f, _search, _idx) \ + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ + sizeof((_f)->nodes.data[0]), \ + found_btree_node_range_start_cmp, &search); \ + _idx < (_f)->nodes.nr && \ + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ + (_f)->nodes.data[_idx].level == _search.level && \ + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) + +bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + struct found_btree_node search = { + .btree_id = b->c.btree_id, + .level = b->c.level, + .min_key = b->data->min_key, + .max_key = b->key.k.p, + }; + + for_each_found_btree_node_in_range(f, search, idx) + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) + return true; + return false; +} + +bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +{ + struct found_btree_node search = { + .btree_id = btree, + .level = 0, + .min_key = POS_MIN, + .max_key = SPOS_MAX, + }; + + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) + return true; + return false; +} + +int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos node_min, struct bpos node_max) +{ + if (btree_id_is_alloc(btree)) + return 0; + + struct find_btree_nodes *f = &c->found_btree_nodes; + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "recovery "); + bch2_btree_id_level_to_text(&buf, btree, level); + prt_str(&buf, " "); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + + struct found_btree_node search = { + .btree_id = btree, + .level = level, + .min_key = node_min, + .max_key = node_max, + }; + + for_each_found_btree_node_in_range(f, search, idx) { + struct found_btree_node n = f->nodes.data[idx]; + + n.range_updated |= bpos_lt(n.min_key, node_min); + n.min_key = bpos_max(n.min_key, node_min); + + n.range_updated |= bpos_gt(n.max_key, node_max); + n.max_key = bpos_min(n.max_key, node_max); + + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, &n); + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = level + 1, + .btree = btree, + })); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) + return ret; + } + + return 0; +} + +void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) +{ + darray_exit(&f->nodes); +} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h new file mode 100644 index 000000000000..08687b209787 --- /dev/null +++ b/fs/bcachefs/btree_node_scan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_H +#define _BCACHEFS_BTREE_NODE_SCAN_H + +int bch2_scan_for_btree_nodes(struct bch_fs *); +bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); +bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct 
bpos, struct bpos); +void bch2_find_btree_nodes_exit(struct find_btree_nodes *); + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h new file mode 100644 index 000000000000..2811b6857c97 --- /dev/null +++ b/fs/bcachefs/btree_node_scan_types.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H +#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H + +#include "darray.h" + +struct found_btree_node { + bool range_updated:1; + u8 btree_id; + u8 level; + unsigned sectors_written; + u32 seq; + u64 journal_seq; + u64 cookie; + + struct bpos min_key; + struct bpos max_key; + + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; +}; + +typedef DARRAY(struct found_btree_node) found_btree_nodes; + +struct find_btree_nodes { + int ret; + struct mutex lock; + found_btree_nodes nodes; +}; + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 30d69a6d133e..c4f524b2ca9a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -9,6 +10,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -19,6 +21,26 @@ #include <linux/prefetch.h> +static const char * const trans_commit_flags_strs[] = { +#define x(n, ...) #n, + BCH_TRANS_COMMIT_FLAGS() +#undef x + NULL +}; + +void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) +{ + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + + prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); + + flags >>= BCH_WATERMARK_BITS; + if (flags) { + prt_char(out, ' '); + bch2_prt_bitflags(out, trans_commit_flags_strs, flags); + } +} + static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -111,11 +133,12 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) return 0; } -static inline void bch2_trans_unlock_write(struct btree_trans *trans) +static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_node_locked_type(trans->paths + i->path, i->level) == + BTREE_NODE_WRITE_LOCKED) bch2_btree_node_unlock_write_inlined(trans, trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; @@ -191,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + bch2_bset_insert(b, k, insert, clobber_u64s); new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) @@ -207,14 +230,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new, v; + unsigned long old, new; unsigned idx = w - b->writes; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); + old = 
READ_ONCE(b->flags); do { - old = new = v; + new = old; if (!(old & (1 << BTREE_NODE_dirty)) || !!(old & (1 << BTREE_NODE_write_idx)) != idx || @@ -224,9 +247,9 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_journal_reclaim; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); bch2_trans_put(trans); @@ -315,17 +338,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + !(i->flags & BTREE_UPDATE_internal_snapshot_node) && + test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 @@ -361,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct bkey_i *new_k; int ret; - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); bch2_trans_unlock(trans); new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); @@ -397,12 +420,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; + unsigned watermark = flags & BCH_WATERMARK_MASK; EBUG_ON(path->level); - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + if (watermark < BCH_WATERMARK_reclaim && + !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -434,34 +458,35 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct btree_insert_entry *i, unsigned flags) { + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_norun)) + return 0; + struct bkey_s_c old = { &i->old_k, i->old_v }; struct bkey_i *new = i->k; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - - verify_update_old_key(trans, i); - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (old_ops->trigger == new_ops->trigger) { - ret = bch2_key_trigger(trans, i->btree_id, i->level, + if (old_ops->trigger == new_ops->trigger) + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); + else + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, + bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags); - } - - return ret; } -static int run_one_trans_trigger(struct btree_trans 
*trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_norun) || + !btree_node_type_has_trans_triggers(i->bkey_type)) + return 0; + /* * Transactional triggers create new btree_insert_entries, so we can't * pass them a pointer to a btree_insert_entry, that memory is going to @@ -471,13 +496,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; - - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; + unsigned flags = i->flags|BTREE_TRIGGER_transactional; if (!i->insert_trigger_run && !i->overwrite_trigger_run && @@ -485,12 +504,12 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -499,43 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) + unsigned *btree_id_updates_start) { - struct btree_insert_entry *i; bool trans_trigger_run; - int ret, overwrite; - for (overwrite = 1; overwrite >= 0; --overwrite) { + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + for (unsigned i = *btree_id_updates_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + i++) { + if (trans->updates[i].btree_id < btree_id) { + *btree_id_updates_start = i; + continue; + } - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->btree_id != btree_id) - continue; + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); - ret = run_one_trans_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + i->btree_id == btree_id && + btree_node_type_has_trans_triggers(i->bkey_type) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); return 0; } static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry 
*btree_id_start = trans->updates; - unsigned btree_id = 0; + unsigned btree_id = 0, btree_id_updates_start = 0; int ret = 0; /* @@ -549,30 +570,20 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); if (ret) return ret; } - trans_for_each_update(trans, i) { - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); - if (ret) - return ret; - break; - } - } + btree_id_updates_start = 0; + ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); + if (ret) + return ret; #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + btree_node_type_has_trans_triggers(i->bkey_type) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif return 0; @@ -580,20 +591,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && - gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + trans_for_each_update(trans, i) + if (btree_node_type_has_triggers(i->bkey_type) && + gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } - } return 0; } @@ -606,11 +610,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; struct btree_trans_commit_hook *h; unsigned u64s = 0; - int ret; + int ret = 0; + + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } /* @@ -662,30 +668,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; + i->k->k.bversion.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; + i->k->k.bversion = MAX_VERSION; } - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - h = trans->hooks; while (h) { ret = h->fn(trans, h); if (ret) - goto revert_fs_usage; + return ret; h = h->next; } + struct jset_entry *entry = trans->journal_entries; + + percpu_down_read(&c->mark_lock); + for (entry = trans->journal_entries; + entry != (void 
*) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && + entry->start->k.type == KEY_TYPE_accounting) { + ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); + if (ret) + goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); + if (btree_node_type_has_atomic_triggers(i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; } @@ -696,6 +712,37 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, validate_context); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + + trans_for_each_update(trans, i) { + validate_context.level = i->level; + validate_context.btree = i->btree_id; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -704,7 +751,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; verify_update_old_key(trans, i); @@ -739,75 +786,41 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; - if (!i->cached) { + if (!i->cached) bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - } else if (!i->key_cache_already_flushed) + else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); - else { + else bch2_btree_key_cache_drop(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - } } return 0; fatal_err: - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); + percpu_down_read(&c->mark_lock); revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + for (struct jset_entry *entry2 = trans->journal_entries; + entry2 != entry; + entry2 = vstruct_next(entry2)) + if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && + entry2->start->k.type == KEY_TYPE_accounting) + bch2_accounting_trans_commit_revert(trans, + bkey_i_to_accounting(entry2->start), flags); + percpu_up_read(&c->mark_lock); return ret; } static noinline void bch2_drop_overwrites_from_journal(struct 
btree_trans *trans) { + /* + * Accounting keys aren't deduped in the journal: we have to compare + * each individual update against what's in the btree to see if it has + * been applied yet, and accounting updates also don't overwrite, + * they're deltas that accumulate. + */ trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -} - -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bkey_invalid_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(err); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; + if (i->k->k.type != KEY_TYPE_accounting) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } static int bch2_trans_commit_journal_pin_flush(struct journal *j, @@ -826,7 +839,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct bch_fs *c = trans->c; int ret = 0, u64s_delta = 0; - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; if (i->cached) continue; @@ -854,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -874,7 +888,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); @@ -887,6 +901,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: @@ -897,7 +912,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_journal_res_get_blocked: /* @@ -905,7 +920,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * flag */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - (flags & BCH_WATERMARK_MASK) != 
BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } @@ -919,9 +934,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; @@ -941,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_started, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. we only add the new key to the list of keys for journal replay to @@ -968,15 +969,26 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - int ret = 0; + + BUG_ON(current != c->recovery_task); trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - break; + return ret; } - return ret; + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (i->type == BCH_JSET_ENTRY_btree_keys || + i->type == BCH_JSET_ENTRY_write_buffer_keys) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); + if (ret) + return ret; + } + + return 0; } int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) @@ -985,60 +997,27 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; + bch2_trans_verify_not_unlocked_or_in_restart(trans); + + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } - - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = 
bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) - return ret; - } - - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) + ret = do_bch2_trans_commit_to_journal_replay(trans); + else + ret = -BCH_ERR_erofs_trans_commit; + goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -1062,7 +1041,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; /* we're going to journal the key being updated: */ @@ -1083,9 +1062,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4a5a64499eb7..a09cbe9cd94f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/rhashtable.h> +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" @@ -137,6 +138,31 @@ struct btree { struct list_head list; }; +#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(lock_intent) \ + x(lock_write) \ + x(dirty) \ + x(read_in_flight) \ + x(write_in_flight) \ + x(noevict) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(access_bit) + +enum bch_btree_cache_not_freed_reasons { +#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, +}; + +struct btree_cache_list { + unsigned idx; + struct shrinker *shrink; + struct list_head list; + size_t nr; +}; + struct btree_cache { struct rhashtable table; bool table_init_done; @@ -154,16 +180,19 @@ struct btree_cache { * should never grow past ~2-3 nodes in practice. 
*/ struct mutex lock; - struct list_head live; struct list_head freeable; struct list_head freed_pcpu; struct list_head freed_nonpcpu; + struct btree_cache_list live[2]; - /* Number of elements in live + freeable lists */ - unsigned used; - unsigned reserve; - atomic_t dirty; - struct shrinker *shrink; + size_t nr_freeable; + size_t nr_reserve; + size_t nr_by_btree[BTREE_ID_NR]; + atomic_long_t nr_dirty; + + /* shrinker stats */ + size_t nr_freed; + u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; /* * If we need to allocate memory for a new btree node and that @@ -173,6 +202,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + /* btree id mask: 0 for leaves, 1 for interior */ + u64 pinned_nodes_mask[2]; }; struct btree_node_iter { @@ -181,36 +215,89 @@ struct btree_node_iter { } data[MAX_BSETS]; }; +#define BTREE_ITER_FLAGS() \ + x(slots) \ + x(intent) \ + x(prefetch) \ + x(is_extents) \ + x(not_extents) \ + x(cached) \ + x(with_key_cache) \ + x(with_updates) \ + x(with_journal) \ + x(snapshot_field) \ + x(all_snapshots) \ + x(filter_snapshots) \ + x(nopreserve) \ + x(cached_nofill) \ + x(key_cache_fill) \ + +#define STR_HASH_FLAGS() \ + x(must_create) \ + x(must_replace) + +#define BTREE_UPDATE_FLAGS() \ + x(internal_snapshot_node) \ + x(nojournal) \ + x(key_cache_reclaim) + + /* - * Iterate over all possible positions, synthesizing deleted keys for holes: - */ -static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -/* - * Indicates that intent locks should be taken on leaf nodes, because we expect - * to be doing updates: - */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; -/* - * Causes the btree iterator code to prefetch additional btree nodes from disk: - */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; -/* - * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for - * @pos or the first key strictly greater than @pos + * BTREE_TRIGGER_norun - don't run triggers at all + * + * BTREE_TRIGGER_transactional - we're running transactional triggers as part of + * a transaction commit: triggers may generate new updates + * + * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction + * commit: we have our journal reservation, we're holding btree node write + * locks, and we know the transaction is going to commit (returning an error + * here is a fatal error, causing us to go emergency read-only) + * + * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. 
disk usage + * + * BTREE_TRIGGER_insert - @new is entering the btree + * BTREE_TRIGGER_overwrite - @old is leaving the btree + * + * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc + * trigger */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; -#define __BTREE_ITER_FLAGS_END 15 +#define BTREE_TRIGGER_FLAGS() \ + x(norun) \ + x(transactional) \ + x(atomic) \ + x(check_repair) \ + x(gc) \ + x(insert) \ + x(overwrite) \ + x(is_root) \ + x(bucket_invalidate) + +enum { +#define x(n) BTREE_ITER_FLAG_BIT_##n, + BTREE_ITER_FLAGS() + STR_HASH_FLAGS() + BTREE_UPDATE_FLAGS() + BTREE_TRIGGER_FLAGS() +#undef x +}; + +/* iter flags must fit in a u16: */ +//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); + +enum btree_iter_update_trigger_flags { +#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_ITER_FLAGS() +#undef x +#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + STR_HASH_FLAGS() +#undef x +#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_UPDATE_FLAGS() +#undef x +#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_TRIGGER_FLAGS() +#undef x +}; enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -301,7 +388,7 @@ struct btree_iter { */ struct bkey k; - /* BTREE_ITER_WITH_JOURNAL: */ + /* BTREE_ITER_with_journal: */ size_t journal_idx; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; @@ -316,17 +403,15 @@ struct bkey_cached { unsigned long flags; u16 u64s; - bool valid; - u32 btree_trans_barrier_seq; struct bkey_cached_key key; struct rhash_head hash; - struct list_head list; struct journal_entry_pin journal; u64 seq; struct bkey_i *k; + struct rcu_head rcu; }; static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) @@ -358,7 +443,21 @@ struct btree_insert_entry { unsigned long ip_allocated; }; +/* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 +/* + * Limit for btree_trans_too_many_iters(); this is enough that almost all code + * paths should run inside this limit, and if they don't it usually indicates a + * bug (leaking/duplicated btree paths). + * + * exception: some fsck paths + * + * bugs with excessive path usage seem to have possibly been eliminated now, so + * we might consider eliminating this (and btree_trans_too_many_iters()) at some + * point. 
+ */ +#define BTREE_ITER_NORMAL_LIMIT 256 +/* never exceed limit */ #define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; @@ -393,11 +492,14 @@ struct btree_trans { btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; btree_path_idx_t nr_paths_max; + btree_path_idx_t nr_updates; u8 fn_idx; - u8 nr_updates; u8 lock_must_abort; bool lock_may_not_fail:1; bool srcu_held:1; + bool locked:1; + bool pf_memalloc_nofs:1; + bool write_locked:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -405,13 +507,19 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; - bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; +#ifdef CONFIG_BCACHEFS_DEBUG + bch_stacktrace last_restarted_trace; +#endif + unsigned long last_unlock_ip; unsigned long srcu_lock_time; const char *fn; @@ -435,8 +543,10 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ - struct replicas_delta_list *fs_usage_deltas; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* Entries before this are zeroed out on every bch2_trans_get() call */ struct list_head list; @@ -494,7 +604,8 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ - x(never_write) + x(never_write) \ + x(pinned) enum btree_flags { /* First bits for btree node write type */ @@ -654,6 +765,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ @@ -666,58 +778,79 @@ const char *bch2_btree_node_type_str(enum btree_node_type); (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) -static inline bool btree_node_type_needs_gc(enum btree_node_type type) +static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) { - return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; } -static inline bool btree_node_type_is_extents(enum btree_node_type type) +static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) +{ + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; +} + +static inline bool btree_node_type_has_triggers(enum btree_node_type type) { - const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; +} + +static inline bool btree_id_is_extents(enum btree_id btree) +{ + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << type) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_id_is_extents(enum btree_id btree) +static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return btree_node_type_is_extents(__btree_node_type(0, btree)); + return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); +} + +static inline bool btree_type_has_snapshots(enum btree_id btree) +{ + const u64 mask = 0 +#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_snapshots)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_snapshots(enum btree_id id) +static inline bool btree_type_has_snapshot_field(enum btree_id btree) { - const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_snapshot_field(enum btree_id id) +static inline bool btree_type_has_ptrs(enum btree_id btree) { - const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_ptrs(enum btree_id id) +static inline bool btree_type_uses_write_buffer(enum btree_id btree) { - const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(btree) & mask; } struct btree_root { @@ -727,7 +860,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c3ff365acce9..13d794f201a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -25,19 +25,22 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, static int __must_check bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_update_flags, + struct bkey_i *, enum btree_iter_update_trigger_flags, unsigned long ip); static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bkey_i **insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_i *update; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) @@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) @@ -98,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { + BTREE_ITER_all_snapshots| + BTREE_ITER_nopreserve, k, ret) { if (!bkey_eq(k.k->p, pos)) break; @@ -132,21 +138,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, darray_init(&s); bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while ((old_k = bch2_btree_iter_prev(&old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, 
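The btree-type predicates above (btree_id_is_extents(), btree_type_has_snapshots(), btree_type_has_snapshot_field(), btree_type_has_ptrs(), btree_type_uses_write_buffer()) all use the same trick: re-expand BCH_BTREE_IDS() with an x() that ORs a flag test, shifted by the btree number, into a compile-time constant mask, then test BIT_ULL(btree) against that mask. A minimal standalone sketch of the technique follows; the ID list, flag names and helper are made up for illustration and are not the real BCH_BTREE_IDS()/BTREE_IS_* definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_FLAG_extents		(1U << 0)
#define EX_FLAG_snapshots	(1U << 1)

/* hypothetical ID list in the BCH_BTREE_IDS() style: x(name, nr, flags) */
#define EX_IDS()							\
	x(extents,	0,	EX_FLAG_extents|EX_FLAG_snapshots)	\
	x(inodes,	1,	EX_FLAG_snapshots)			\
	x(alloc,	2,	0)

enum ex_id {
#define x(name, nr, flags)	EX_ID_##name = nr,
	EX_IDS()
#undef x
};

static inline bool ex_id_is_extents(enum ex_id id)
{
	/* one bit per ID, set iff that ID carries EX_FLAG_extents: */
	const uint64_t mask = 0
#define x(name, nr, flags)	|((uint64_t) !!((flags) & EX_FLAG_extents) << (nr))
	EX_IDS()
#undef x
	;

	return (UINT64_C(1) << id) & mask;
}

int main(void)
{
	printf("extents: %d, inodes: %d\n",
	       ex_id_is_extents(EX_ID_extents),
	       ex_id_is_extents(EX_ID_inodes));
	return 0;
}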
old_k.k->p)) { struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) continue; new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bkey_err(new_k); if (ret) break; @@ -162,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, update->k.type = KEY_TYPE_whiteout; ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } bch2_trans_iter_exit(trans, &new_iter); @@ -179,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, struct bkey_s_c old, struct bkey_s_c new) { @@ -212,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -229,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -254,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, } ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -267,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, bch2_cut_front(new.k->p, update); ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| flags, _RET_IP_); if (ret) return ret; @@ -279,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, static int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_s_c k; @@ -287,10 +293,10 @@ static int bch2_trans_update_extent(struct btree_trans *trans, int ret = 0; bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -317,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto out; next: bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -340,7 +346,7 @@ err: static noinline int flush_new_cached_update(struct 
btree_trans *trans, struct btree_insert_entry *i, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bkey k; @@ -348,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, btree_path_idx_t path_idx = bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); + BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; @@ -366,9 +372,9 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, goto out; i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; + i->flags |= BTREE_TRIGGER_norun; - btree_path_set_should_be_locked(btree_path); + btree_path_set_should_be_locked(trans, btree_path); ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: bch2_path_put(trans, path_idx, true); @@ -377,7 +383,7 @@ out: static int __must_check bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_update_flags flags, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; @@ -416,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, break; } - if (!cmp && i < trans->updates + trans->nr_updates) { + bool overwrite = !cmp && i < trans->updates + trans->nr_updates; + + if (overwrite) { EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); @@ -443,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, } } - __btree_path_get(trans->paths + i->path, true); + __btree_path_get(trans, trans->paths + i->path, true); + + trace_update_by_path(trans, path, i, overwrite); /* * If a key is present in the key cache, it must also exist in the @@ -452,7 +462,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && bkey_deleted(&i->old_k)) + if (path->cached && !i->old_btree_u64s) return flush_new_cached_update(trans, i, flags, ip); return 0; @@ -473,15 +483,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); + BTREE_ITER_intent| + BTREE_ITER_cached, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); if (unlikely(ret)) return ret; @@ -492,24 +502,24 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); } return 0; } int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if 
(iter->flags & BTREE_ITER_is_extents) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + !(flags & BTREE_UPDATE_key_cache_reclaim) && + (iter->flags & BTREE_ITER_filter_snapshots)) { ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -522,7 +532,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter * Ensure that updates to cached btrees go to the key cache: */ struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + if (!(flags & BTREE_UPDATE_key_cache_reclaim) && !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { @@ -578,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + int ret = bkey_err(k); if (ret) goto err; @@ -615,15 +622,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int bch2_btree_insert_nonextent(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); @@ -631,16 +638,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, } int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - int ret; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); + BTREE_ITER_intent|flags); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -653,37 +657,31 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, * @disk_res: must be non-NULL whenever inserting or potentially * splitting data extents * @flags: transaction commit flags + * @iter_flags: btree iter update trigger flags * * Returns: 0 on success, error code on failure */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags) + struct disk_reservation *disk_res, int flags, + enum btree_iter_update_trigger_flags iter_flags) { - return bch2_trans_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, 0)); + return bch2_trans_commit_do(c, disk_res, NULL, flags, + bch2_btree_insert_trans(trans, id, k, iter_flags)); } -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) +int bch2_btree_delete_at(struct btree_trans 
*trans, + struct btree_iter *iter, unsigned update_flags) { - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; bkey_init(&k->k); k->k.p = iter->pos; - bch2_key_resize(&k->k, len); return bch2_trans_update(trans, iter, k, update_flags); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -692,8 +690,8 @@ int bch2_btree_delete(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -711,8 +709,8 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); + while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -739,7 +737,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -785,9 +783,37 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } +int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = iter->pos; + if (iter->flags & BTREE_ITER_is_extents) + bch2_key_resize(&k->k, 1); + + return bch2_trans_update(trans, iter, k, 0); +} + int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); + + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_bit_mod_iter(trans, &iter, set); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct bkey_i k; bkey_init(&k.k); @@ -797,10 +823,17 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { + unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); + prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + + int ret = buf->allocation_failure ? 
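bch2_btree_bit_mod_iter() and bch2_btree_bit_mod() treat a btree position as a boolean: "set" is a KEY_TYPE_set key at that position (resized to cover one unit on extents-style btrees), "cleared" is a whiteout (KEY_TYPE_deleted). A hedged usage sketch follows, assuming a struct bch_fs *c; the choice of BTREE_ID_need_discard is purely illustrative and error handling is reduced to passing the commit result back.

/*
 * Illustrative only, not part of the patch: set a bit-style key at @pos,
 * then clear it again, each in its own single-update transaction commit.
 */
static int example_toggle_bitset_key(struct bch_fs *c, struct bpos pos)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
			bch2_btree_bit_mod(trans, BTREE_ID_need_discard, pos, true));
	if (ret)
		return ret;

	return bch2_trans_commit_do(c, NULL, NULL, 0,
			bch2_btree_bit_mod(trans, BTREE_ID_need_discard, pos, false));
}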
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -825,7 +858,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, if (ret) goto err; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + if (!test_bit(JOURNAL_running, &c->journal.flags)) { ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); if (ret) goto err; @@ -835,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_do(c, NULL, NULL, - BCH_TRANS_COMMIT_lazy_rw|commit_flags, - __bch2_trans_log_msg(trans, &buf, u64s)); + ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, + bch2_trans_log_msg(trans, &buf)); } err: printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b9382b7b288b..47d8690f01bf 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -24,11 +24,11 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, #define BCH_TRANS_COMMIT_FLAGS() \ x(no_enospc, "don't check for enospc") \ x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ x(no_journal_res, "don't take a journal reservation, instead " \ "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ @@ -44,30 +44,33 @@ enum bch_trans_commit_flags { #undef x }; -int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, - unsigned, unsigned); +void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_update_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, int flags); + enum btree_iter_update_trigger_flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct + disk_reservation *, int flags, enum + btree_iter_update_trigger_flags iter_flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + 
return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, @@ -93,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_update_flags, + enum btree_iter_update_trigger_flags, struct bkey_s_c, struct bkey_s_c); int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); @@ -123,11 +126,31 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct btree_trans *, + enum btree_id, struct bkey_i *); + static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) { - if (unlikely(trans->journal_replay_not_finished)) + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + dump_stack(); + return ret; + } + /* + * Most updates skip the btree write buffer until journal replay is + * finished because synchronization with journal replay relies on having + * a btree node locked - if we're overwriting a key in the journal that + * journal replay hasn't yet replayed, we have to mark it as + * overwritten. + * + * But accounting updates don't overwrite, they're deltas, and they have + * to be flushed to the btree strictly in order for journal replay to be + * able to tell which updates need to be applied: + */ + if (k->k.type != KEY_TYPE_accounting && + unlikely(trans->journal_replay_not_finished)) return bch2_btree_insert_clone_trans(trans, btree, k); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); @@ -144,6 +167,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); @@ -175,15 +199,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ @@ -200,14 +216,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->journal_entries_u64s = 0; trans->hooks = NULL; trans->extra_disk_res = 0; - - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) 
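bch2_trans_commit_do() (the renamed bch2_trans_do()) is just bch2_trans_run() wrapped around commit_do(): get a transaction, run the expression with restart handling, commit with the given disk reservation, journal seq and flags, then put the transaction; bch2_btree_insert() above is the canonical caller. A rough open-coded equivalent, shown only to illustrate what the macro expands to (the helper name and the zero iter flags are illustrative):

/* Illustrative sketch, not from the patch: */
static int example_insert_one_key(struct bch_fs *c, enum btree_id btree,
				  struct bkey_i *k)
{
	/* same shape as bch2_trans_commit_do(c, NULL, NULL, 0, ...) */
	return bch2_trans_run(c,
		commit_do(trans, NULL, NULL, 0,
			  bch2_btree_insert_trans(trans, btree, k, 0)));
}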
&trans->fs_usage_deltas->memset_start); - } } static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, @@ -219,7 +227,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t if (type && k.k->type != type) return ERR_PTR(-ENOENT); - mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + /* extra padding for varint_decode_fast... */ + mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); if (!IS_ERR(mut)) { bkey_reassemble(mut, k); @@ -242,7 +251,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags, + struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); @@ -259,8 +269,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str return mut; } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags) +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); } @@ -272,10 +283,11 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type); + btree_id, pos, flags|BTREE_ITER_intent, type); struct bkey_i *ret = IS_ERR(k.k) ? 
ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); @@ -287,7 +299,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); } @@ -295,10 +307,11 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); int ret; if (IS_ERR(mut)) @@ -316,7 +329,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned min_bytes) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); } @@ -324,7 +338,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); } @@ -335,7 +349,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - unsigned flags, unsigned type, unsigned val_size) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned val_size) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); int ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2c3..e4e7c804625e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -15,86 +16,138 @@ #include "clock.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" #include <linux/random.h> +static const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BTREE_UPDATE_MODES() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - 
btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_INTENT, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + log_fsck_err(trans, btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf); + goto topology_repair; + } - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); + log_fsck_err(trans, btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf); + goto topology_repair; + } + } - bch2_btree_node_iter_init_from_start(&iter, b); + if (!b->c.level) + goto out; - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? 
node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); } -#endif + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + ret = bch2_topology_error(c); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -102,7 +155,6 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) { struct bkey_packed *k; - struct bset_tree *t; struct bkey uk; for_each_bset(b, t) @@ -179,10 +231,6 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) BUG_ON(b->will_make_reachable); clear_btree_node_noevict(b); - - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); } static void bch2_btree_node_free_inmem(struct btree_trans *trans, @@ -190,19 +238,19 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(trans, b); + + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = 
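bch2_btree_node_check_topology() above verifies that an interior node's children exactly tile the parent's key range: the first child must start at the parent's min_key, each following child must start at bpos_successor() of the previous child's max, the final child must end at the parent's max_key, and empty interior nodes are an error. A minimal standalone sketch of the same invariant over plain integer ranges (the names and the u64 positions are stand-ins for struct bpos, not bcachefs code):

#include <stdbool.h>
#include <stdint.h>

struct child_range {
	uint64_t min;	/* first position covered by the child */
	uint64_t max;	/* last position covered by the child */
};

/*
 * Children must tile [parent_min, parent_max] with no gaps, no overlaps and
 * nothing extending past the end - the u64 analogue of the min_key /
 * bpos_successor(prev.k->k.p) / max_key checks above.
 */
static bool children_tile_parent(const struct child_range *c, unsigned nr,
				 uint64_t parent_min, uint64_t parent_max)
{
	if (!nr)
		return false;	/* empty interior node */

	uint64_t expect_min = parent_min;

	for (unsigned i = 0; i < nr; i++) {
		if (c[i].min != expect_min || c[i].max < c[i].min)
			return false;
		expect_min = c[i].max + 1;	/* bpos_successor() analogue */
	}

	return c[nr - 1].max == parent_max;
}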
ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static void bch2_btree_node_free_never_used(struct btree_update *as, @@ -211,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, { struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - struct btree_path *path; - unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -226,8 +272,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); BUG_ON(p->nr >= ARRAY_SIZE(p->b)); @@ -235,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, @@ -255,11 +296,17 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; + b = bch2_btree_node_mem_alloc(trans, interior_node); + if (IS_ERR(b)) + return b; + + BUG_ON(b->ob.nr); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -268,10 +315,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; + goto out; } mutex_unlock(&c->btree_reserve_cache_lock); - retry: ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: @@ -284,7 +330,7 @@ retry: c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -303,19 +349,16 @@ retry: bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); -mem_alloc: - b = bch2_btree_node_mem_alloc(trans, interior_node); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob.nr); - +out: bkey_copy(&b->key, &tmp.k); b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return b; +err: + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); } static struct btree *bch2_btree_node_alloc(struct btree_update *as, @@ -457,8 +500,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + bch2_btree_node_to_freelist(c, b); } } } @@ -550,6 +592,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + 
for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -570,7 +632,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -579,7 +641,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -607,6 +669,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: @@ -616,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; + bch2_trans_begin(trans); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); if (seq == as->old_nodes_seq[i]) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, @@ -639,7 +706,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. 
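The bch2_trans_begin() / bch2_trans_unlock_long() pair added around the old-node wait above follows the same rule as the drop_locks_do() used further down in this patch in bch2_btree_update_start(): don't sleep on IO or on external conditions while holding btree node locks. A hedged sketch of that pattern, reusing the JOURNAL_space_low wait that appears later in the patch (the helper name is illustrative; drop_locks_do() unlocks the transaction, evaluates the expression, then relocks):

// Illustrative sketch, not from the patch:
static int example_wait_for_journal_space(struct btree_trans *trans,
					  struct bch_fs *c)
{
	return drop_locks_do(trans,
		({ wait_event(c->journal.wait,
			      !test_bit(JOURNAL_space_low, &c->journal.flags));
		   0; }));
}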
*/ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -647,14 +714,27 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s(): error %s", __func__, bch2_err_str(ret)); + "%s", bch2_err_str(ret)); err: - if (as->b) { + /* + * Ensure transaction is unlocked before using btree_node_lock_nopath() + * (the use of which is always suspect, we need to work on removing this + * in the future) + * + * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so we may + * rarely end up with a locked path besides the one we have here: + */ + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - b = as->b; - btree_path_idx_t path_idx = get_unlocked_mut_path(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { /* * @b is the node we did the final insert into: * @@ -667,17 +747,9 @@ err: * we're in journal error state: */ - /* - * Ensure transaction is unlocked before using - * btree_node_lock_nopath() (the use of which is always suspect, - * we need to work on removing this in the future) - * - * It should be, but get_unlocked_mut_path() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so - * we may rarely end up with a locked path besides the one we - * have here: - */ - bch2_trans_unlock(trans); + btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); @@ -722,7 +794,7 @@ err: mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); six_unlock_write(&b->c.lock); - btree_node_write_if_need(c, b, SIX_LOCK_intent); + btree_node_write_if_need(trans, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); bch2_path_put(trans, path_idx, true); } @@ -743,7 +815,7 @@ err: b = as->new_nodes[i]; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -795,15 +867,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -825,7 +899,7 
@@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -836,7 +910,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -850,7 +924,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1028,7 +1102,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1045,7 +1119,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1053,7 +1127,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1068,29 +1142,29 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < c->journal.watermark) { - struct journal_res res = { 0 }; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_space_low, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, - watermark|JOURNAL_RES_GET_CHECK)); + ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); if (ret) return ERR_PTR(ret); } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? 
*/ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1098,11 +1172,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1116,12 +1190,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->flags = flags; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1163,7 +1240,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } @@ -1174,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); bch2_trans_unlock(trans); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } @@ -1193,7 +1270,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS)) + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_reclaim_would_deadlock) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1208,33 +1286,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c = as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, 
&old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1248,6 +1328,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1262,26 +1343,23 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct bch_fs *c = as->c; struct bkey_packed *k; struct printbuf buf = PRINTBUF; - unsigned long old, new, v; + unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && - !btree_ptr_sectors_written(insert)); + !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "inserting invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_printf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); - - bch2_fs_inconsistent(c, "%s", buf.buf); + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_commit, + }; + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { + bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } @@ -1301,25 +1379,25 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_interior; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); printbuf_exit(&buf); } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1330,15 +1408,35 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); + for (; + insert != keys->top && bpos_le(insert->k.p, b->key.k.p); + insert = bkey_next(insert)) + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - if (bpos_gt(insert->k.p, b->key.k.p)) - break; + if (bch2_btree_node_check_topology(trans, b)) { + struct printbuf buf = PRINTBUF; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - bch2_keylist_pop_front(keys); + for (struct bkey_i *k = keys->keys; + k != insert; + k = bkey_next(k)) { + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + prt_newline(&buf); + } + + panic("%s(): check_topology 
error: inserted keys\n%s", __func__, buf.buf); } + + memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); + keys->top_p -= insert->_data - keys->keys_p; +} + +static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) +{ + if (insert_keys) + for_each_keylist_key(insert_keys, k) + if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) + return true; + return false; } /* @@ -1348,7 +1446,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, static void __btree_split_node(struct btree_update *as, struct btree_trans *trans, struct btree *b, - struct btree *n[2]) + struct btree *n[2], + struct keylist *insert_keys) { struct bkey_packed *k; struct bpos n1_pos = POS_MIN; @@ -1378,9 +1477,17 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || + key_deleted_in_insert(insert_keys, uk.p))) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); @@ -1439,8 +1546,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1469,15 +1575,13 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - - btree_node_interior_verify(as->c, b); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); } } static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1486,10 +1590,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); - bch2_btree_interior_update_will_free_node(as, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; @@ -1499,7 +1606,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - __btree_split_node(as, trans, b, n); + __btree_split_node(as, trans, b, n, keys); if (keys) { btree_split_insert_keys(as, trans, path, n1, keys); @@ -1515,12 +1622,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); + path2 = 
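The node-flags update in bch2_insert_fixup_btree_ptr() above was converted from an open-coded cmpxchg() retry loop to try_cmpxchg(), which writes the observed value back into 'old' on failure so the loop no longer re-reads b->flags itself. A standalone sketch of the same pattern using C11 atomics (the flag values are made up; the kernel's try_cmpxchg() behaves like the weak compare-exchange used here):

#include <stdatomic.h>

#define EX_NEED_WRITE		(1UL << 0)
#define EX_WRITE_TYPE_MASK	(3UL << 1)
#define EX_WRITE_interior	(1UL << 1)

/* Set the write type and the need_write bit without taking a lock: */
static void ex_mark_need_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags), new;

	do {
		new = old;
		new &= ~EX_WRITE_TYPE_MASK;
		new |= EX_WRITE_interior;
		new |= EX_NEED_WRITE;
		/* on failure, 'old' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}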
bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path2, n2); @@ -1565,7 +1672,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); @@ -1578,26 +1685,29 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); - if (ret) - goto err; + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + + bch2_btree_interior_update_will_free_node(as, b); + if (n3) { bch2_btree_update_get_open_buckets(as, n3); - bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); } if (n2) { bch2_btree_update_get_open_buckets(as, n2); - bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); } bch2_btree_update_get_open_buckets(as, n1); - bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1644,27 +1754,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1673,7 +1762,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1683,10 +1771,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = 
b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1709,9 +1798,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } + + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_insert_keys_interior(as, trans, path, b, keys); + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1725,21 +1824,20 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); - - btree_node_interior_verify(c, b); return 0; split: /* * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1747,7 +1845,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1759,7 +1856,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1775,6 +1872,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int 
bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, @@ -1794,9 +1950,26 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_trans_verify_not_unlocked_or_in_restart(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + + if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) { + flags &= ~BCH_WATERMARK_MASK; + flags |= BCH_WATERMARK_btree; + flags |= BCH_TRANS_COMMIT_journal_reclaim; + } + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -1810,12 +1983,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, : bpos_successor(b->data->max_key); sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - btree_path_set_should_be_locked(trans->paths + sib_path); + btree_path_set_should_be_locked(trans, trans->paths + sib_path); m = trans->paths[sib_path].l[level].b; @@ -1845,8 +2018,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } @@ -1882,9 +2054,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, trace_and_count(c, btree_node_merge, trans, b); - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, @@ -1904,7 +2073,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -1916,14 +2085,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; + bch2_btree_interior_update_will_free_node(as, b); + 
bch2_btree_interior_update_will_free_node(as, m); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, trans->paths + path, b); bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); @@ -1943,6 +2115,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); @@ -1970,15 +2146,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto out; - bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -1987,16 +2161,18 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); - if (ret) - goto err; + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); @@ -2021,42 +2197,50 @@ struct async_btree_rewrite { struct list_head list; enum btree_id btree_id; unsigned level; - struct bpos pos; - __le64 seq; + struct bkey_buf key; }; static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct btree *b; - int ret; - - bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + bch2_trans_node_iter_init(trans, &iter, + a->btree_id, a->key.k->k.p, BTREE_MAX_DEPTH, a->level, 0); - b = bch2_btree_iter_peek_node(&iter); - ret = PTR_ERR_OR_ZERO(b); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) { + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, 0) + : -ENOENT; + +#if 0 + /* Tracepoint... 
*/ + if (!ret || ret == -ENOENT) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null"); - bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", - __func__, a->seq, buf.buf); + if (!ret) { + prt_printf(&buf, "rewrite node:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + } else { + prt_printf(&buf, "node to rewrite not found:\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + prt_printf(&buf, "\n got: "); + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null)"); + } + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - goto out; } - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +#endif out: bch2_trans_iter_exit(trans, &iter); - return ret; } @@ -2065,85 +2249,99 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(trans, a)); - bch_err_fn(c, ret); + int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); + if (ret != -ENOENT) + bch_err_fn_ratelimited(c, ret); + + spin_lock(&c->btree_node_rewrites_lock); + list_del(&a->list); + spin_unlock(&c->btree_node_rewrites_lock); + + closure_wake_up(&c->btree_node_rewrites_wait); + + bch2_bkey_buf_exit(&a->key, c); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { - struct async_btree_rewrite *a; - int ret; - - a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) { - bch_err(c, "%s: error allocating memory", __func__); + struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) return; - } a->c = c; a->btree_id = b->c.btree_id; a->level = b->c.level; - a->pos = b->key.k.p; - a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - mutex_lock(&c->pending_node_rewrites_lock); - list_add(&a->list, &c->pending_node_rewrites); - mutex_unlock(&c->pending_node_rewrites_lock); - return; - } + bch2_bkey_buf_init(&a->key); + bch2_bkey_buf_copy(&a->key, c, &b->key); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_started, &c->flags)) { - bch_err(c, "%s: error getting c->writes ref", __func__); - kfree(a); - return; - } + bool now = false, pending = false; - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "going read-write"); - if (ret) { - kfree(a); - return; - } + spin_lock(&c->btree_node_rewrites_lock); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + list_add(&a->list, &c->btree_node_rewrites); + now = true; + } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + list_add(&a->list, &c->btree_node_rewrites_pending); + pending = true; + } + spin_unlock(&c->btree_node_rewrites_lock); - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + if (now) { + queue_work(c->btree_node_rewrite_worker, &a->work); + } else if (pending) { + /* bch2_do_pending_node_rewrites will execute */ + } else { + bch2_bkey_buf_exit(&a->key, c); + kfree(a); } +} - queue_work(c->btree_interior_update_worker, &a->work); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) +{ + closure_wait_event(&c->btree_node_rewrites_wait, + 
list_empty(&c->btree_node_rewrites)); } void bch2_do_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; - - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + if (a) + list_add(&a->list, &c->btree_node_rewrites); + spin_unlock(&c->btree_node_rewrites_lock); + + if (!a) + break; bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } - mutex_unlock(&c->pending_node_rewrites_lock); } void bch2_free_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + spin_unlock(&c->btree_node_rewrites_lock); - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + if (!a) + break; + bch2_bkey_buf_exit(&a->key, c); kfree(a); } - mutex_unlock(&c->pending_node_rewrites_lock); } static int __bch2_btree_node_update_key(struct btree_trans *trans, @@ -2161,10 +2359,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (!skip_triggers) { ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_TRANSACTIONAL) ?: + BTREE_TRIGGER_transactional) ?: bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s(new_key), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -2181,7 +2379,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT, + iter2.flags & BTREE_ITER_intent, _THIS_IP_); struct btree_path *path2 = btree_iter_path(trans, &iter2); @@ -2193,7 +2391,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; } else { @@ -2220,7 +2418,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (new_hash) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - bch2_btree_node_hash_remove(&c->btree_cache, b); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); @@ -2272,6 +2471,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } new_hash = bch2_btree_node_mem_alloc(trans, false); + ret = PTR_ERR_OR_ZERO(new_hash); + if (ret) + goto err; } path->intent_ref++; @@ -2279,14 +2481,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite commit_flags, skip_triggers); --path->intent_ref; - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - list_move(&new_hash->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&new_hash->c.lock); - six_unlock_intent(&new_hash->c.lock); - } + if (new_hash) + 
bch2_btree_node_to_freelist(c, new_hash); +err: closure_sync(&cl); bch2_btree_cache_cannibalize_unlock(trans); return ret; @@ -2301,7 +2498,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -2315,7 +2512,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); @@ -2339,7 +2535,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2356,9 +2552,13 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(trans); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2385,9 +2585,25 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); +} + +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: ", (void *) as->ip_started); + bch2_trans_commit_flags_to_text(out, as->flags); + + prt_str(out, " "); + bch2_btree_id_to_text(out, as->btree_id); + prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + as->update_level_start, + as->update_level_end, + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2396,12 +2612,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } @@ -2463,8 +2674,35 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, return end; } +static void bch2_btree_alloc_to_text(struct printbuf *out, + struct bch_fs *c, + struct btree_alloc *a) +{ + printbuf_indent_add(out, 2); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k)); + prt_newline(out); + + struct open_bucket *ob; + unsigned i; + open_bucket_for_each(c, &a->ob, ob, i) + bch2_open_bucket_to_text(out, c, ob); + + printbuf_indent_sub(out, 2); +} + +void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++) + bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]); +} + void 
bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + WARN_ON(!list_empty(&c->btree_node_rewrites)); + WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); + + if (c->btree_node_rewrite_worker) + destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) destroy_workqueue(c->btree_interior_update_worker); mempool_exit(&c->btree_interior_update_pool); @@ -2478,17 +2716,23 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - INIT_LIST_HEAD(&c->pending_node_rewrites); - mutex_init(&c->pending_node_rewrites_lock); + INIT_LIST_HEAD(&c->btree_node_rewrites); + INIT_LIST_HEAD(&c->btree_node_rewrites_pending); + spin_lock_init(&c->btree_node_rewrites_lock); } int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + c->btree_node_rewrite_worker = + alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); + if (!c->btree_node_rewrite_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c593c925d1e3..26d646e1275c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -10,6 +10,20 @@ #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) +int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); + +#define BTREE_UPDATE_MODES() \ + x(none) \ + x(node) \ + x(root) \ + x(update) + +enum btree_update_mode { +#define x(n) BTREE_UPDATE_##n, + BTREE_UPDATE_MODES() +#undef x +}; + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -32,28 +46,24 @@ struct btree_update { struct closure cl; struct bch_fs *c; u64 start_time; + unsigned long ip_started; struct list_head list; struct list_head unwritten_list; - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - + enum btree_update_mode mode; + enum bch_trans_commit_flags flags; unsigned nodes_written:1; unsigned took_gc_lock:1; enum btree_id btree_id; - unsigned update_level; + unsigned update_level_start; + unsigned update_level_end; struct disk_reservation disk_res; /* - * BTREE_INTERIOR_UPDATING_NODE: + * BTREE_UPDATE_node: * The update that made the new nodes visible was a regular update to an * existing interior node - @b. 
We can't write out the update to @b * until the new nodes we created are finished writing, so we block @b @@ -119,6 +129,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); +int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); + int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); @@ -132,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); + if (bch2_btree_node_merging_disabled) + return 0; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; @@ -144,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { + bch2_trans_verify_not_unlocked_or_in_restart(trans); + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: bch2_foreground_maybe_merge_sibling(trans, path, level, flags, @@ -160,7 +177,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); + +int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); +void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) @@ -259,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && @@ -315,9 +334,12 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, unsigned long); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); +void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_btree_interior_update_exit(struct bch_fs *); void bch2_fs_btree_interior_update_init_early(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index ac7844861966..2c09d19dd621 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -1,22 +1,24 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "disk_accounting.h" #include "error.h" +#include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include <linux/prefetch.h> +#include <linux/sort.h> static int bch2_btree_write_buffer_journal_flush(struct journal *, 
struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: @@ -46,6 +48,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke #endif } +static int wb_key_seq_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { @@ -113,7 +123,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| @@ -123,7 +133,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb, - bool *write_locked, size_t *fast) + bool *write_locked, + bool *accounting_accumulated, + size_t *fast) { struct btree_path *path; int ret; @@ -136,6 +148,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite if (ret) return ret; + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); + + if (k.k->type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), + bkey_s_c_to_accounting(k)); + } + *accounting_accumulated = true; + /* * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse(): @@ -182,13 +204,13 @@ btree_write_buffered_insert(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); + BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -242,16 +264,37 @@ out: BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } +int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "attempting to do write buffer update on non wb btree="); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, "\n"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EROFS; +} + static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_iter iter = { NULL }; - size_t skipped = 0, fast = 0, slowpath = 0; + size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; + bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; + bch2_trans_unlock(trans); 
bch2_trans_begin(trans); @@ -285,16 +328,32 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + if (unlikely(!btree_type_uses_write_buffer(k->btree))) { + ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + goto err; + } + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); BUG_ON(!k->journal_seq); + if (!accounting_replay_done && + k->k.k.type == KEY_TYPE_accounting) { + slowpath++; + continue; + } + if (i + 1 < &darray_top(wb->sorted) && wb_key_eq(i, i + 1)) { struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - skipped++; + if (k->k.k.type == KEY_TYPE_accounting && + n->k.k.type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), + bkey_i_to_s_c_accounting(&k->k)); + + overwritten++; n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); k->journal_seq = 0; continue; @@ -307,25 +366,37 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; + + ret = lockrestart_do(trans, + bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + if (ret) + goto err; } } if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_intent|BTREE_ITER_all_snapshots); } bch2_btree_iter_set_pos(&iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; + bool accounting_accumulated = false; do { if (race_fault()) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); + ret = wb_flush_one(trans, &iter, k, &write_locked, + &accounting_accumulated, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); @@ -357,49 +428,128 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + sort(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); + if (!accounting_replay_done && + i->k.k.type == KEY_TYPE_accounting) { + could_not_insert++; + continue; + } + + if (!could_not_insert) + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); bch2_trans_begin(trans); ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, + BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; + + i->journal_seq = 0; + } + + /* + * If journal replay hasn't finished with accounting keys we + * can't flush accounting keys at all - condense them and leave + * them for next time. + * + * Q: Can the write buffer overflow? 
+ * A Shouldn't be any actual risk. It's just new accounting + * updates that the write buffer can't flush, and those are only + * going to be generated by interior btree node updates as + * journal replay has to split/rewrite nodes to make room for + * its updates. + * + * And for those new acounting updates, updates to the same + * counters get accumulated as they're flushed from the journal + * to the write buffer - see the patch for eytzingcer tree + * accumulated. So we could only overflow if the number of + * distinct counters touched somehow was very large. + */ + if (could_not_insert) { + struct btree_write_buffered_key *dst = wb->flushing.keys.data; + + darray_for_each(wb->flushing.keys, i) + if (i->journal_seq) + *dst++ = *i; + wb->flushing.keys.nr = dst - wb->flushing.keys.data; } } err: - bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; + if (ret || !could_not_insert) { + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; + } + + bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); return ret; } -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } +out: + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; + return ret; +} + +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, + bool *did_work) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; @@ -408,7 +558,9 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) do { bch2_trans_unlock(trans); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); + + *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; /* * On memory allocation failure, bch2_btree_write_buffer_flush_locked() @@ -419,8 +571,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || + 
(wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); return ret; } @@ -429,17 +581,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool did_work = false; - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; + bool did_work = false; trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); +} + +/* + * The write buffer requires flushing when going RO: keys in the journal for the + * write buffer don't have a journal pin yet + */ +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) +{ + if (bch2_journal_error(&c->journal)) + return false; + + bool did_work = false; + bch2_trans_run(c, btree_write_buffer_flush_seq(trans, + journal_cur_seq(&c->journal), &did_work)); + return did_work; } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) @@ -468,6 +637,49 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } +/* + * In check and repair code, when checking references to write buffer btrees we + * need to issue a flush before we have a definitive error: this issues a flush + * if this is a key we haven't yet checked. + */ +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_buf tmp; + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); @@ -483,6 +695,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } +static void wb_accounting_sort(struct btree_write_buffer *wb) +{ + eytzinger0_sort(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, NULL); +} + +int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, + struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key new = { .btree = btree }; + + bkey_copy(&new.k, &k->k_i); + + int ret = darray_push(&wb->accounting, new); + if (ret) + return ret; + + wb_accounting_sort(wb); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) @@ -552,11 +787,35 @@ 
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); + + darray_for_each(wb->accounting, i) + memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); } -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) { struct btree_write_buffer *wb = &c->btree_write_buffer; + unsigned live_accounting_keys = 0; + int ret = 0; + + darray_for_each(wb->accounting, i) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { + i->journal_seq = dst->seq; + live_accounting_keys++; + ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); + if (ret) + break; + } + + if (live_accounting_keys * 2 < wb->accounting.nr) { + struct btree_write_buffered_key *dst = wb->accounting.data; + + darray_for_each(wb->accounting, src) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) + *dst++ = *src; + wb->accounting.nr = dst - wb->accounting.data; + wb_accounting_sort(wb); + } if (!dst->wb->keys.nr) bch2_journal_pin_drop(&c->journal, &dst->wb->pin); @@ -569,30 +828,7 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); mutex_unlock(&wb->inc.lock); -} -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - struct jset_entry *entry; - struct bkey_i *k; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - buf->need_flush_to_write_buffer = false; -out: - bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; } @@ -624,6 +860,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && !bch2_journal_error(&c->journal)); + darray_exit(&wb->accounting); darray_exit(&wb->sorted); darray_exit(&wb->flushing.keys); darray_exit(&wb->inc.keys); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index eebcd2b15249..d535cea28bde 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_WRITE_BUFFER_H #include "bkey.h" +#include "disk_accounting.h" static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) { @@ -20,25 +21,58 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); +struct bkey_buf; +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); + struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; size_t room; u64 seq; }; +static inline int wb_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); +} + +int 
bch2_accounting_key_to_wb_slowpath(struct bch_fs *, + enum btree_id, struct bkey_i_accounting *); + +static inline int bch2_accounting_key_to_wb(struct bch_fs *c, + enum btree_id btree, struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key search; + search.btree = btree; + search.k.k.p = k->k.p; + + unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, &search); + + if (idx >= wb->accounting.nr) + return bch2_accounting_key_to_wb_slowpath(c, btree, k); + + struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); + bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *, struct journal_keys_to_wb *, enum btree_id, struct bkey_i *); -static inline int bch2_journal_key_to_wb(struct bch_fs *c, +static inline int __bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { - EBUG_ON(!dst->seq); - if (unlikely(!dst->room)) return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); @@ -51,8 +85,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, return 0; } +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + return k->k.type == KEY_TYPE_accounting + ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) + : __bch2_journal_key_to_wb(c, dst, btree, k); +} + void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 9b9433de9c36..e9e76e20f43b 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -52,6 +52,8 @@ struct btree_write_buffer { struct btree_write_buffer_keys inc; struct btree_write_buffer_keys flushing; struct work_struct flush_work; + + DARRAY(struct btree_write_buffered_key) accounting; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54f7826ac498..345b117a4a4a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -13,11 +13,14 @@ #include "btree_update.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "inode.h" #include "movinggc.h" +#include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" @@ -25,197 +28,10 @@ #include <linux/preempt.h> -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) -{ - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } -} - -void bch2_fs_usage_initialize(struct bch_fs *c) -{ - percpu_down_write(&c->mark_lock); - struct bch_fs_usage *usage = c->usage_base; - 
- for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->b.reserved += usage->persistent_reserved[i]; - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); - } - - for_each_member_device(c, ca) { - struct bch_dev_usage dev = bch2_dev_usage_read(ca); - - usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * - ca->mi.bucket_size; - } - - percpu_up_write(&c->mark_lock); -} - -static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, - unsigned journal_seq, - bool gc) -{ - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? ca->usage_gc - : ca->usage[journal_seq & JOURNAL_BUF_MASK]); -} - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { - struct bch_fs *c = ca->fs; - unsigned seq, i, u64s = dev_usage_u64s(); - - do { - seq = read_seqcount_begin(&c->usage_lock); - memcpy(usage, ca->usage_base, u64s * sizeof(u64)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); - } while (read_seqcount_retry(&c->usage_lock, seq)); -} - -u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -{ - ssize_t offset = v - (u64 *) c->usage_base; - unsigned i, seq; - u64 ret; - - BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); - percpu_rwsem_assert_held(&c->mark_lock); - - do { - seq = read_seqcount_begin(&c->usage_lock); - ret = *v; - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) -{ - struct bch_fs_usage_online *ret; - unsigned nr_replicas = READ_ONCE(c->replicas.nr); - unsigned seq, i; -retry: - ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); - if (unlikely(!ret)) - return NULL; - - percpu_down_read(&c->mark_lock); - - if (nr_replicas != c->replicas.nr) { - nr_replicas = c->replicas.nr; - percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; - } - - ret->online_reserved = percpu_u64_get(c->online_reserved); - - do { - seq = read_seqcount_begin(&c->usage_lock); - unsafe_memcpy(&ret->u, c->usage_base, - __fs_usage_u64s(nr_replicas) * sizeof(u64), - "embedded variable length struct"); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], - __fs_usage_u64s(nr_replicas)); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - -void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -{ - unsigned u64s = fs_usage_u64s(c); - - BUG_ON(idx >= ARRAY_SIZE(c->usage)); - - preempt_disable(); - write_seqcount_begin(&c->usage_lock); - - acc_u64s_percpu((u64 *) c->usage_base, - (u64 __percpu *) c->usage[idx], u64s); - percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) { - u64s = dev_usage_u64s(); - - acc_u64s_percpu((u64 *) ca->usage_base, - (u64 __percpu *) ca->usage[idx], u64s); - percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); - } - rcu_read_unlock(); - - write_seqcount_end(&c->usage_lock); - preempt_enable(); -} - -void bch2_fs_usage_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_fs_usage_online *fs_usage) -{ - unsigned i; - - 
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); - - prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.b.hidden); - prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.b.data); - prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.b.cached); - prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.b.reserved); - prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.b.nr_inodes); - prt_printf(out, "online reserved:\t\t%llu\n", - fs_usage->online_reserved); - - for (i = 0; - i < ARRAY_SIZE(fs_usage->u.persistent_reserved); - i++) { - prt_printf(out, "%u replicas:\n", i + 1); - prt_printf(out, "\treserved:\t\t%llu\n", - fs_usage->u.persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - prt_printf(out, "\t"); - bch2_replicas_entry_to_text(out, e); - prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); - } + memset(usage, 0, sizeof(*usage)); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } static u64 reserve_factor(u64 r) @@ -223,16 +39,6 @@ static u64 reserve_factor(u64 r) return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -{ - return min(fs_usage->u.b.hidden + - fs_usage->u.b.btree + - fs_usage->u.b.data + - reserve_factor(fs_usage->u.b.reserved + - fs_usage->online_reserved), - c->capacity); -} - static struct bch_fs_usage_short __bch2_fs_usage_read_short(struct bch_fs *c) { @@ -240,17 +46,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); + percpu_u64_get(&c->usage->hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + - bch2_fs_usage_read_one(c, &c->usage_base->b.btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + + data = percpu_u64_get(&c->usage->data) + + percpu_u64_get(&c->usage->btree); + reserved = percpu_u64_get(&c->usage->reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); + ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); return ret; } @@ -267,324 +73,346 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_init(struct bch_dev *ca) +void bch2_dev_usage_to_text(struct printbuf *out, + struct bch_dev *ca, + struct bch_dev_usage *usage) { - ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; -} + if (out->nr_tabstops < 5) { + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + } -void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) -{ - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); - prt_tab(out); - prt_u64(out, usage->d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, 
"\t%llu\r%llu\r%llu\r\n", + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } -} - -void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new, - u64 journal_seq, bool gc) -{ - struct bch_fs_usage *fs_usage; - struct bch_dev_usage *u; - - preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); - - if (data_type_is_hidden(old->data_type)) - fs_usage->b.hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new->data_type)) - fs_usage->b.hidden += ca->mi.bucket_size; - - u = dev_usage_ptr(ca, journal_seq, gc); - - u->d[old->data_type].buckets--; - u->d[new->data_type].buckets++; - u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); - u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - - u->d[BCH_DATA_cached].sectors += new->cached_sectors; - u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - - u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); - u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); - - preempt_enable(); + prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); } -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) +static int bch2_check_fix_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + bool *do_update) { - return (struct bch_alloc_v4) { - .gen = b.gen, - .data_type = b.data_type, - .dirty_sectors = b.dirty_sectors, - .cached_sectors = b.cached_sectors, - .stripe = b.stripe, - }; -} + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; -void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket *old, struct bucket *new) -{ - struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); - struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (!ca) { + if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, + trans, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + return 0; + } - bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); -} + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + if (!g) { + if (fsck_err(trans, ptr_to_invalid_device, + "pointer to invalid bucket on device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + goto out; + } -static inline int __update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - int idx = bch2_replicas_entry_idx(c, r); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - if (idx < 0) - return -1; + if (fsck_err_on(!g->gen_valid, + trans, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + *do_update = true; + } + } - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - return 0; -} + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data 
type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->stripe_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } + } -int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry_v1 *r, s64 sectors, - unsigned journal_seq, bool gc) -{ - struct bch_fs_usage *fs_usage; - int idx, ret = 0; - struct printbuf buf = PRINTBUF; + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; - percpu_down_read(&c->mark_lock); + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + trans, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; - idx = bch2_replicas_entry_idx(c, r); - if (idx < 0 && - fsck_err(c, ptr_to_missing_replicas_entry, - "no replicas entry\n while marking %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, r); - percpu_down_read(&c->mark_lock); + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + goto out; - if (ret) - goto err; - idx = bch2_replicas_entry_idx(c, r); - } - if (idx < 0) { - ret = -1; - goto err; + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + trans, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->stripe_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } } - preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - preempt_enable(); -err: + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, + trans, ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), + trans, ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + } +out: fsck_err: - percpu_up_read(&c->mark_lock); + bch2_dev_put(ca); 
printbuf_exit(&buf); return ret; } -static inline int update_cached_sectors(struct bch_fs *c, - struct bkey_s_c k, - unsigned dev, s64 sectors, - unsigned journal_seq, bool gc) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); -} - -static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, - gfp_t gfp) +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - struct replicas_delta_list *d = trans->fs_usage_deltas; - unsigned new_size = d ? (d->size + more) * 2 : 128; - unsigned alloc_size = sizeof(*d) + new_size; - - WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); - - if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, gfp|__GFP_ZERO); - - if (unlikely(!d)) { - if (alloc_size > REPLICAS_DELTA_LIST_MAX) - return -ENOMEM; - - d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); - if (!d) - return -ENOMEM; - - memset(d, 0, REPLICAS_DELTA_LIST_MAX); - - if (trans->fs_usage_deltas) - memcpy(d, trans->fs_usage_deltas, - trans->fs_usage_deltas->size + sizeof(*d)); - - new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); - kfree(trans->fs_usage_deltas); - } + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; - d->size = new_size; - trans->fs_usage_deltas = d; + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) + goto err; } - return 0; -} - -int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -{ - return allocate_dropping_locks_errcode(trans, - __replicas_deltas_realloc(trans, more, _gfp)); -} - -int bch2_update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - struct replicas_delta_list *d; - struct replicas_delta *n; - unsigned b; - int ret; - - if (!sectors) - return 0; - - b = replicas_entry_bytes(r) + 8; - ret = bch2_replicas_deltas_realloc(trans, b); - if (ret) - return ret; - - d = trans->fs_usage_deltas; - n = (void *) d->d + d->used; - n->delta = sectors; - unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), - r, replicas_entry_bytes(r), - "flexible array member embedded in strcuct with padding"); - bch2_replicas_entry_sort(&n->r); - d->used += b; - return 0; -} - -int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); + if (do_update) { + if (flags & BTREE_TRIGGER_is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } - return bch2_update_replicas_list(trans, &r.e, sectors); -} + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; -int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) -{ - struct bucket old, new, *g; - int ret = 0; + rcu_read_lock(); + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); + rcu_read_unlock(); - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - BUG_ON(data_type != BCH_DATA_sb && - data_type != BCH_DATA_journal); + if (level) { + /* + * We don't want to drop 
btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; + ptr->gen = g->gen; + } + rcu_read_unlock(); + } else { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } + rcu_read_unlock(); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } - percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, b); + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); + bch_info(c, "updated %s", buf.buf); - bucket_lock(g); - old = *g; + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; - goto err; - } + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; - goto err; + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } - - g->data_type = data_type; - g->dirty_sectors += sectors; - new = *g; err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } -int bch2_check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - 
const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 bucket_sectors) +int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 *bucket_sectors) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); struct printbuf buf = PRINTBUF; + bool inserting = sectors > 0; int ret = 0; - if (bucket_data_type == BCH_DATA_cached) - bucket_data_type = BCH_DATA_user; - - if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || - (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) - bucket_data_type = ptr_data_type = BCH_DATA_stripe; + BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_too_stale, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -592,35 +420,35 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; + } + + if (b_gen != ptr->gen && ptr->cached) { + ret = 1; + goto out; } - if (b_gen != ptr->gen && !ptr->cached) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_stale_dirty_ptr, + if (b_gen != ptr->gen) { + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - *bucket_gen(ca, bucket_nr), + bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; - } - - if (b_gen != ptr->gen) { - ret = 1; + if (inserting) + goto err; goto out; } - if (!data_type_is_empty(bucket_data_type) && - ptr_data_type && - bucket_data_type != ptr_data_type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, + if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -628,72 +456,38 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + 
goto out; } - if ((u64) bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_sector_count_overflow, + if ((u64) *bucket_sectors + sectors > U32_MAX) { + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - bucket_sectors, sectors, + *bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + sectors = -*bucket_sectors; } + + *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; err: +fsck_err: bch2_dump_trans_updates(trans); + bch2_inconsistent_error(c); + ret = -BCH_ERR_bucket_ref_update; goto out; } -void bch2_trans_fs_usage_revert(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *dst; - struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; - s64 added = 0; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - /* revert changes: */ - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); - } - - dst->b.nr_inodes -= deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added -= deltas->persistent_reserved[i]; - dst->b.reserved -= deltas->persistent_reserved[i]; - dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; - } - - if (added > 0) { - trans->disk_res->sectors += added; - this_cpu_add(*c->online_reserved, added); - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); -} - void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -702,8 +496,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) bool warn = false; percpu_down_read(&c->mark_lock); - preempt_disable(); - struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; @@ -714,13 +506,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) */ s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { - u64 old, new, v = atomic64_read(&c->sectors_available); + u64 old, new; + old = atomic64_read(&c->sectors_available); do { - old = v; new = max_t(s64, 0, old - should_not_have_added); - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, new)); added -= should_not_have_added; warn = true; @@ -731,13 +523,9 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) this_cpu_sub(*c->online_reserved, added); } - dst->hidden += src->hidden; - dst->btree += src->btree; - dst->data += src->data; - dst->cached += src->cached; - dst->reserved += src->reserved; - dst->nr_inodes += src->nr_inodes; - + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); percpu_up_read(&c->mark_lock); @@ -747,150 
+535,104 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) should_not_have_added, disk_res_sectors); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - - dst->b.nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - dst->b.reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; -need_mark: - /* revert changes: */ - for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) - BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return -1; -} - /* KEY_TYPE_extent: */ -static int __mark_pointer(struct btree_trans *trans, +static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, - const struct bch_extent_ptr *ptr, + const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) + struct bch_alloc_v4 *a, + bool insert) { - u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, *dst_sectors); + u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : + !p->ptr.cached ? &a->dirty_sectors : + &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, + a->gen, a->data_type, dst_sectors); if (ret) return ret; - - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - + if (insert) + alloc_data_type_set(a, ptr_data_type); return 0; } static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, s64 *sectors, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); - struct bpos bucket; - struct bch_backpointer bp; + struct bch_fs *c = trans->c; + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct printbuf buf = PRINTBUF; + int ret = 0; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); - *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; + *sectors = insert ? 
bp.v.bucket_len : -(s64) bp.v.bucket_len; - ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (unlikely(!ca)) { + if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) + ret = -BCH_ERR_trigger_pointer; + goto err; + } + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); + + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) - return ret; + goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); if (ret) - return ret; + goto err; } } - if (flags & BTREE_TRIGGER_GC) { - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); - - percpu_down_read(&c->mark_lock); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); - struct bucket old = *g; - - u8 bucket_data_type = g->data_type; - int ret = __mark_pointer(trans, k, &p.ptr, *sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (ret) { - bucket_unlock(g); - percpu_up_read(&c->mark_lock); - return ret; + if (flags & BTREE_TRIGGER_gc) { + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -BCH_ERR_trigger_pointer; + goto err; } - g->data_type = bucket_data_type; - struct bucket new = *g; + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); + alloc_to_bucket(g, new); bucket_unlock(g); - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); - } - return 0; + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + } +err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, enum bch_data_type data_type, - s64 sectors, unsigned flags) + s64 sectors, + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); + BTREE_ITER_with_updates, stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, @@ -903,7 +645,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -EIO; + ret = -BCH_ERR_trigger_stripe_pointer; goto err; } @@ -911,20 +653,20 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = bch2_update_replicas_list(trans, &r.e, sectors); + struct 
disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = data_type; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); err: bch2_trans_iter_exit(trans, &iter); return ret; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", @@ -942,16 +684,21 @@ err: (u64) p.ec.idx, buf.buf); printbuf_exit(&buf); bch2_inconsistent_error(c); - return -EIO; + return -BCH_ERR_trigger_stripe_pointer; } m->block_sectors[p.ec.block] += sectors; - struct bch_replicas_padded r = m->r; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); mutex_unlock(&c->ec_stripes_heap_lock); - r.e.data_type = data_type; - bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + acc.replicas.data_type = data_type; + int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); + if (ret) + return ret; } return 0; @@ -959,45 +706,49 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags, + s64 *replicas_sectors) { - bool gc = flags & BTREE_TRIGGER_GC; - struct bch_fs *c = trans->c; + bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bch_replicas_padded r; enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 dirty_sectors = 0; int ret = 0; - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; + struct disk_accounting_pos acc_replicas_key = { + .type = BCH_DISK_ACCOUNTING_replicas, + .replicas.data_type = data_type, + .replicas.nr_devs = 0, + .replicas.nr_required = 1, + }; + + struct disk_accounting_pos acct_compression_key = { + .type = BCH_DISK_ACCOUNTING_compression, + }; + u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); + s64 disk_sectors = 0; + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; bool stale = ret > 0; + if (p.ptr.cached && stale) + continue; + if (p.ptr.cached) { - if (!stale) { - ret = !gc - ?
bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) - : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); - bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors", - __func__); - if (ret) - return ret; - } + ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); + if (ret) + return ret; } else if (!p.has_ec) { - dirty_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; + *replicas_sectors += disk_sectors; + replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -1008,21 +759,77 @@ static int __trigger_extent(struct btree_trans *trans, * if so they're not required for mounting if we have an * erasure coded pointer in this extent: */ - r.e.nr_required = 0; + acc_replicas_key.replicas.nr_required = 0; } - } - if (r.e.nr_devs) { - ret = !gc - ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) - : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); - if (unlikely(ret && gc)) { - struct printbuf buf = PRINTBUF; + if (acct_compression_key.compression.type && + acct_compression_key.compression.type != p.crc.compression_type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); - printbuf_exit(&buf); + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); + if (ret) + return ret; + + compression_acct[0] = 1; + compression_acct[1] = 0; + compression_acct[2] = 0; + } + + acct_compression_key.compression.type = p.crc.compression_type; + if (p.crc.compression_type) { + compression_acct[1] += p.crc.uncompressed_size; + compression_acct[2] += p.crc.compressed_size; } + } + + if (acc_replicas_key.replicas.nr_devs) { + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { + struct disk_accounting_pos acc_snapshot_key = { + .type = BCH_DISK_ACCOUNTING_snapshot, + .snapshot.id = k.k->p.snapshot, + }; + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acct_compression_key.compression.type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); + + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); + if (ret) + return ret; + } + + if (level) { + struct disk_accounting_pos acc_btree_key = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = btree_id, + }; + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + if (ret) + return ret; + } else { + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct disk_accounting_pos acc_inum_key = { + .type = BCH_DISK_ACCOUNTING_inum, + .inum.inum = k.k->p.inode, + }; + s64 v[3] = { + insert ? 1 : -1, + insert ? 
k.k->size : -((s64) k.k->size), + *replicas_sectors, + }; + ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); if (ret) return ret; } @@ -1031,15 +838,19 @@ static int __trigger_extent(struct btree_trans *trans, } int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); + /* if pointers aren't changing - nothing to do: */ if (new_ptrs_bytes == old_ptrs_bytes && !memcmp(new_ptrs.start, @@ -1047,60 +858,75 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - - (int) bch2_bkey_needs_rebalance(c, old); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, + flags & ~BTREE_TRIGGER_insert, + &old_replicas_sectors); if (ret) return ret; } - } - if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, + flags & ~BTREE_TRIGGER_overwrite, + &new_replicas_sectors); + if (ret) + return ret; + } - return 0; -} + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta = 0; -/* KEY_TYPE_reservation */ + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta -= s; -static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size * replicas; + s = bch2_bkey_sectors_need_rebalance(c, new.s_c); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta += s; - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, need_rebalance_delta > 0); + if (ret) + return ret; + } - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; + if (need_rebalance_sectors_delta) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, + flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } + } - struct replicas_delta_list *d = trans->fs_usage_deltas; - replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); + return 0; +} - d->persistent_reserved[replicas - 1] += sectors; - } +/* KEY_TYPE_reservation */ - if (flags & 
BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); +static int __trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) +{ + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 sectors = k.k->size; - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); + if (flags & BTREE_TRIGGER_overwrite) + sectors = -sectors; - replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->b.reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, + }; - preempt_enable(); - percpu_up_read(&c->mark_lock); + return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc); } return 0; @@ -1109,7 +935,7 @@ static int __trigger_reservation(struct btree_trans *trans, int bch2_trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } @@ -1117,35 +943,29 @@ int bch2_trigger_reservation(struct btree_trans *trans, /* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, + struct bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_i_alloc_v4 *a; int ret = 0; - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); if (IS_ERR(a)) return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_metadata_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, bch2_data_type_str(a->v.data_type), bch2_data_type_str(type), bch2_data_type_str(type)); - ret = -EIO; + ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } @@ -1156,24 +976,80 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: +fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } +static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, + u64 b, enum bch_data_type data_type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) + goto err; + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g); + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_type_str(g->data_type), +
bch2_data_type_str(data_type))) + goto err_unlock; + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_type_str(g->data_type ?: data_type), + g->dirty_sectors, sectors)) + goto err_unlock; + + g->data_type = data_type; + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + bucket_unlock(g); + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + return ret; +err_unlock: + bucket_unlock(g); +err: + return -BCH_ERR_metadata_bucket_inconsistency; +} + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, - enum bch_data_type type, - unsigned sectors) + struct bch_dev *ca, u64 b, + enum bch_data_type type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) { - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + BUG_ON(type != BCH_DATA_free && + type != BCH_DATA_sb && + type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + if (flags & BTREE_TRIGGER_gc) + return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); + else if (flags & BTREE_TRIGGER_transactional) + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + else + BUG(); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - u64 *bucket, unsigned *bucket_sectors) + struct bch_dev *ca, u64 start, u64 end, + enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, + enum btree_iter_update_trigger_flags flags) { do { u64 b = sector_to_bucket(ca, start); @@ -1182,7 +1058,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, if (b != *bucket && *bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors); + type, *bucket_sectors, flags); if (ret) return ret; @@ -1197,35 +1073,40 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct bch_dev *ca) +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + struct bch_fs *c = trans->c; + + mutex_lock(&c->sb_lock); + struct bch_sb_layout layout = ca->disk_sb.sb->layout; + mutex_unlock(&c->sb_lock); + u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors); + offset + (1 << layout.sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } if (bucket_sectors) { ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors); + bucket, 
BCH_DATA_sb, bucket_sectors, flags); if (ret) return ret; } @@ -1233,7 +1114,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size); + BCH_DATA_journal, ca->mi.bucket_size, flags); if (ret) return ret; } @@ -1241,20 +1122,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - + int ret = bch2_trans_run(c, + __bch2_trans_mark_dev_sb(trans, ca, flags)); bch_err_fn(c, ret); return ret; } -int bch2_trans_mark_dev_sbs(struct bch_fs *c) +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, + enum btree_iter_update_trigger_flags flags) { for_each_online_member(c, ca) { - int ret = bch2_trans_mark_dev_sb(c, ca); + int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); return ret; } } @@ -1262,16 +1145,46 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return 0; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); +} + +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { struct bch_fs_pcpu *pcpu; - u64 old, v, get; - s64 sectors_available; + u64 old, get; + u64 sectors_available; int ret; percpu_down_read(&c->mark_lock); @@ -1281,17 +1194,16 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (sectors <= pcpu->sectors_available) goto out; - v = atomic64_read(&c->sectors_available); + old = atomic64_read(&c->sectors_available); do { - old = v; get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { preempt_enable(); goto recalculate; } - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, old - get)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, old - get)); pcpu->sectors_available += get; @@ -1310,6 +1222,9 @@ recalculate: percpu_u64_set(&c->pcpu->sectors_available, 0); sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) + sectors = min(sectors, sectors_available); + if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, @@ -1330,75 +1245,82 @@ recalculate: /* Startup/shutdown: */ +void bch2_buckets_nouse_free(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + kvfree_rcu_mightsleep(ca->buckets_nouse); + ca->buckets_nouse = NULL; + } +} + +int bch2_buckets_nouse_alloc(struct bch_fs *c) 
+{ + for_each_member_device(c, ca) { + BUG_ON(ca->buckets_nouse); + + ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { + bch2_dev_put(ca); + return -BCH_ERR_ENOMEM_buckets_nouse; + } + } + + return 0; +} + static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { - ret = -BCH_ERR_ENOMEM_bucket_gens; - goto err; - } + if (resize) + lockdep_assert_held(&c->state_lock); - if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { - ret = -BCH_ERR_ENOMEM_buckets_nouse; + if (resize && ca->buckets_nouse) + return -BCH_ERR_no_resize_with_buckets_nouse; + + bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); + if (!bucket_gens) { + ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; - - if (resize) { - down_write(&c->gc_lock); - down_write(&ca->bucket_lock); - percpu_down_write(&c->mark_lock); - } + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - + bucket_gens->nbuckets = min(bucket_gens->nbuckets, + old_bucket_gens->nbuckets); + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; memcpy(bucket_gens->b, old_bucket_gens->b, - n); - if (buckets_nouse) - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); + bucket_gens->nbuckets); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; - swap(ca->buckets_nouse, buckets_nouse); - nbuckets = ca->mi.nbuckets; - if (resize) { - percpu_up_write(&c->mark_lock); - up_write(&ca->bucket_lock); - up_write(&c->gc_lock); - } - ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,31 +1329,16 @@ err: void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); - - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - free_percpu(ca->usage[i]); - kfree(ca->usage_base); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); + free_percpu(ca->usage); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); - if (!ca->usage_base) + ca->usage = alloc_percpu(struct bch_dev_usage); + if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { - ca->usage[i] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[i]) - return -BCH_ERR_ENOMEM_usage_init; - 
} - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6387e039f789..a9acdd6c0c86 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -12,7 +12,7 @@ #include "extents.h" #include "sb-members.h" -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) { return div_u64(s, ca->mi.bucket_size); } @@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) +static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) { return div_u64_rem(s, ca->mi.bucket_size, offset); } @@ -81,60 +80,58 @@ static inline void bucket_lock(struct bucket *b) TASK_UNINTERRUPTIBLE); } -static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->buckets_gc, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || - lockdep_is_held(&ca->bucket_lock)); -} - static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - struct bucket_array *buckets = gc_bucket_array(ca); - - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); - return buckets->b + b; + return bucket_valid(ca, b) + ? genradix_ptr(&ca->buckets_gc, b) + : NULL; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || - lockdep_is_held(&ca->bucket_lock)); + lockdep_is_held(&ca->fs->state_lock)); } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + if (b - gens->first_bucket >= gens->nbuckets_minus_first) + return NULL; return gens->b + b; } +static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) +{ + u8 *gen = bucket_gen(ca, b); + return gen ? *gen : -1; +} + +static inline int bucket_gen_get(struct bch_dev *ca, size_t b) +{ + rcu_read_lock(); + int ret = bucket_gen_get_rcu(ca, b); + rcu_read_unlock(); + return ret; +} + static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { return sector_to_bucket(ca, ptr->offset); } -static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, const struct bch_extent_ptr *ptr, u32 *bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); } @@ -175,19 +172,21 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? r : 0; } +static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +{ + int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); + return gen < 0 ? 
gen : gen_after(gen, ptr->gen); +} + /** - * ptr_stale() - check if a pointer points into a bucket that has been + * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 ptr_stale(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 ret; - rcu_read_lock(); - ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); - return ret; } @@ -202,8 +201,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_init(struct bch_dev *); -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -226,6 +224,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma fallthrough; case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: break; } @@ -263,129 +262,52 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ -static inline unsigned __fs_usage_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_u64s(struct bch_fs *c) -{ - return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); -} - -static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_online_u64s(struct bch_fs *c) -{ - return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); -} - static inline unsigned dev_usage_u64s(void) { return sizeof(struct bch_dev_usage) / sizeof(u64); } -u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); - -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); - -void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); - -void bch2_fs_usage_to_text(struct printbuf *, - struct bch_fs *, struct bch_fs_usage_online *); - -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); - struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); -void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, u64, bool); -void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, - struct bucket *, struct bucket *); - -/* key/bucket marking: */ - -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - -int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, - struct bch_replicas_entry_v1 *, s64, - unsigned, bool); -int bch2_update_replicas_list(struct btree_trans *, - struct bch_replicas_entry_v1 *, s64); -int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); -int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); - -void bch2_fs_usage_initialize(struct bch_fs *); +int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, + struct bkey_s_c, const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32 *); -int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, - const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32); - -int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_check_fix_ptrs(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, + enum btree_iter_update_trigger_flags); int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ ret; \ }) void bch2_trans_account_disk_usage_change(struct btree_trans *); -void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); - -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, + enum bch_data_type, unsigned, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, + enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { @@ -394,14 +316,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type) : "(invalid data type)"; } -static inline void bch2_prt_data_type(struct printbuf *out, 
enum bch_data_type type) -{ - if (type < BCH_DATA_NR) - prt_str(out, __bch2_data_types[type]); - else - prt_printf(out, "(invalid data type %u)", type); -} - /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, @@ -413,25 +327,27 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } } -#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +enum bch_reservation_flags { + BCH_DISK_RESERVATION_NOFAIL = 1 << 0, + BCH_DISK_RESERVATION_PARTIAL = 1 << 1, +}; -int __bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - u64, int); +int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, + u64, enum bch_reservation_flags); static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { #ifdef __KERNEL__ u64 old, new; + old = this_cpu_read(c->pcpu->sectors_available); do { - old = this_cpu_read(c->pcpu->sectors_available); if (sectors > old) return __bch2_disk_reservation_add(c, res, sectors, flags); new = old - sectors; - } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; @@ -471,6 +387,9 @@ static inline u64 avail_factor(u64 r) return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } +void bch2_buckets_nouse_free(struct bch_fs *); +int bch2_buckets_nouse_alloc(struct bch_fs *); + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); void bch2_dev_buckets_free(struct bch_dev *); int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6a31740222a7..7174047b8e92 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -16,24 +16,19 @@ struct bucket { u32 stripe; u32 dirty_sectors; u32 cached_sectors; -}; - -struct bucket_array { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - struct bucket b[]; -}; + u32 stripe_sectors; +} __aligned(sizeof(long)); struct bucket_gens { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; - u8 b[]; + size_t nbuckets_minus_first; + u8 b[] __counted_by(nbuckets); }; struct bch_dev_usage { - struct { + struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ /* @@ -54,18 +49,6 @@ struct bch_fs_usage_base { u64 nr_inodes; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ - struct bch_fs_usage_base b; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - u64 replicas[]; -}; - -struct bch_fs_usage_online { - u64 online_reserved; - struct bch_fs_usage u; -}; - struct bch_fs_usage_short { u64 capacity; u64 used; diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index ec1b636ef78d..c8a488e6b7b8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ memset(t->d, 0, sizeof(t->d[0]) << t->bits); } -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket) +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) { struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - bool ret = false; - unsigned i; + u64 ret = 
0; mutex_lock(&b->lock); t = b->t; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq > flushed_seq; + ret = h->journal_seq; break; } } @@ -93,7 +91,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,8 +104,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_bits = t->bits + (nr_elements * 3 > size); - + new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); +realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; @@ -115,7 +113,16 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, } retry_rehash: + if (nr_rehashes_this_size == 3) { + new_bits++; + nr_rehashes_this_size = 0; + kvfree(n); + goto realloc; + } + nr_rehashes++; + nr_rehashes_this_size++; + bucket_table_init(n, new_bits); tmp = new; diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h index d2ae19cbe18c..365619ca44c8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -4,8 +4,8 @@ #include "buckets_waiting_for_journal_types.h" -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64); +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, + unsigned, u64); int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, u64, unsigned, u64, u64); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 226b39c17667..46e9e32105a9 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -5,11 +5,12 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "disk_accounting.h" +#include "fsck.h" #include "journal.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" -#include "super.h" #include "super-io.h" #include "thread_with_file.h" @@ -22,12 +23,6 @@ #include <linux/slab.h> #include <linux/uaccess.h> -__must_check -static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? 
-EFAULT : 0; -} - /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -38,12 +33,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (dev >= c->sb.nr_devices) return ERR_PTR(-EINVAL); - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + ca = bch2_dev_tryget_noerror(c, dev); if (!ca) return ERR_PTR(-EINVAL); } else { @@ -137,102 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - char **devs; - size_t nr_devs; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - if (thr->devs) - for (size_t i = 0; i < thr->nr_devs; i++) - kfree(thr->devs[i]); - kfree(thr->devs); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(void *arg) -{ - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.thr.ret) - bch2_fs_stop(c); - - thread_with_stdio_done(&thr->thr); - return 0; -} - -static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - u64 *devs = NULL; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || - !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || - !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - thr->nr_devs = arg.nr_devs; - - if (copy_from_user(devs, &user_arg->devs[0], - array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { - ret = -EINVAL; - goto err; - } - - for (size_t i = 0; i < arg.nr_devs; i++) { - thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); - ret = PTR_ERR_OR_ZERO(thr->devs[i]); - if (ret) - goto err; - } - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, optstr); - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_offline_thread_fn); -err: - if (ret < 0) { - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - } - kfree(devs); - return ret; -} - static long bch2_global_ioctl(unsigned cmd, void __user *arg) { long ret; @@ -304,7 +198,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -371,7 +266,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) return PTR_ERR(ca); ret = bch2_dev_offline(c, ca, arg.flags); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -400,7 +295,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, if (ret) bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - percpu_ref_put(&ca->ref); + 
bch2_dev_put(ca); return ret; } @@ -453,7 +348,6 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, static const struct file_operations bcachefs_data_ops = { .release = bch2_data_job_release, .read = bch2_data_job_read, - .llseek = no_llseek, }; static long bch2_ioctl_data(struct bch_fs *c, @@ -486,11 +380,9 @@ static long bch2_ioctl_data(struct bch_fs *c, static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_fs_usage *arg = NULL; - struct bch_replicas_usage *dst_e, *dst_end; - struct bch_fs_usage_online *src; + struct bch_ioctl_fs_usage arg = {}; + darray_char replicas = {}; u32 replica_entries_bytes; - unsigned i; int ret = 0; if (!test_bit(BCH_FS_started, &c->flags)) @@ -499,62 +391,60 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); - if (!arg) - return -ENOMEM; - - src = bch2_fs_usage_read(c); - if (!src) { - ret = -ENOMEM; + ret = bch2_fs_replicas_usage_read(c, &replicas) ?: + (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); + if (ret) goto err; - } - - arg->capacity = c->capacity; - arg->used = bch2_fs_sectors_used(c, src); - arg->online_reserved = src->online_reserved; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg->persistent_reserved[i] = src->u.persistent_reserved[i]; - - dst_e = arg->replicas; - dst_end = (void *) arg->replicas + replica_entries_bytes; - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *src_e = - cpu_replicas_entry(&c->replicas, i); - - /* check that we have enough space for one replicas entry */ - if (dst_e + 1 > dst_end) { - ret = -ERANGE; - break; - } - dst_e->sectors = src->u.replicas[i]; - dst_e->r = *src_e; + struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); + arg.capacity = c->capacity; + arg.used = u.used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.replica_entries_bytes = replicas.nr; - /* recheck after setting nr_devs: */ - if (replicas_usage_next(dst_e) > dst_end) { - ret = -ERANGE; - break; - } - - memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = i, + }; - dst_e = replicas_usage_next(dst_e); + bch2_accounting_mem_read(c, + disk_accounting_pos_to_bpos(&k), + &arg.persistent_reserved[i], 1); } - arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +err: + darray_exit(&replicas); + return ret; +} + +static long bch2_ioctl_query_accounting(struct bch_fs *c, + struct bch_ioctl_query_accounting __user *user_arg) +{ + struct bch_ioctl_query_accounting arg; + darray_char accounting = {}; + int ret = 0; - percpu_up_read(&c->mark_lock); - kfree(src); + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: + bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: + (arg.accounting_u64s * sizeof(u64) < accounting.nr ? 
-ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); if (ret) goto err; - ret = copy_to_user_errcode(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + arg.capacity = c->capacity; + arg.used = bch2_fs_usage_read_short(c).used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.accounting_u64s = accounting.nr / sizeof(u64); + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); err: - kfree(arg); + darray_exit(&accounting); return ret; } @@ -589,13 +479,13 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - for (i = 0; i < BCH_DATA_NR; i++) { + for (i = 0; i < ARRAY_SIZE(arg.d); i++) { arg.d[i].buckets = src.d[i].buckets; arg.d[i].sectors = src.d[i].sectors; arg.d[i].fragmented = src.d[i].fragmented; } - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } @@ -647,7 +537,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, goto err; } err: - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -669,11 +559,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, if (arg.flags & BCH_READ_DEV) { ca = bch2_device_lookup(c, arg.dev, arg.flags); - - if (IS_ERR(ca)) { - ret = PTR_ERR(ca); - goto err; - } + ret = PTR_ERR_OR_ZERO(ca); + if (ret) + goto err_unlock; sb = ca->disk_sb.sb; } else { @@ -688,8 +576,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, vstruct_bytes(sb)); err: - if (!IS_ERR_OR_NULL(ca)) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); +err_unlock: mutex_unlock(&c->sb_lock); return ret; } @@ -733,7 +621,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, ret = bch2_dev_resize(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -759,98 +647,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); - return ret; -} - -static int bch2_fsck_online_thread_fn(void *arg) -{ - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? 
- */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); - - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); - - clear_bit(BCH_FS_fsck_running, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - thread_with_stdio_done(&thr->thr); - - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - return 0; -} - -static long bch2_ioctl_fsck_online(struct bch_fs *c, - struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->online_fsck_mutex)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, optstr); - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_online_thread_fn); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - } + bch2_dev_put(ca); return ret; } @@ -911,6 +708,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); case BCH_IOCTL_FSCK_ONLINE: BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); + case BCH_IOCTL_QUERY_ACCOUNTING: + return bch2_ioctl_query_accounting(c, arg); default: return -ENOTTY; } @@ -940,7 +739,9 @@ static const struct file_operations bch_chardev_fops = { }; static int bch_chardev_major; -static struct class *bch_chardev_class; +static const struct class bch_chardev_class = { + .name = "bcachefs", +}; static struct device *bch_chardev; void bch2_fs_chardev_exit(struct bch_fs *c) @@ -957,7 +758,7 @@ int bch2_fs_chardev_init(struct bch_fs *c) if (c->minor < 0) return c->minor; - c->chardev = device_create(bch_chardev_class, NULL, + c->chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, c->minor), c, "bcachefs%u-ctl", c->minor); if (IS_ERR(c->chardev)) @@ -968,32 +769,39 @@ int bch2_fs_chardev_init(struct bch_fs *c) void bch2_chardev_exit(void) { - if (!IS_ERR_OR_NULL(bch_chardev_class)) - device_destroy(bch_chardev_class, - MKDEV(bch_chardev_major, U8_MAX)); - if (!IS_ERR_OR_NULL(bch_chardev_class)) - class_destroy(bch_chardev_class); + device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); + class_unregister(&bch_chardev_class); if (bch_chardev_major > 0) unregister_chrdev(bch_chardev_major, "bcachefs"); } int __init bch2_chardev_init(void) { + int ret; + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); if (bch_chardev_major < 0) return bch_chardev_major; - bch_chardev_class = class_create("bcachefs"); - if (IS_ERR(bch_chardev_class)) - return PTR_ERR(bch_chardev_class); + ret = class_register(&bch_chardev_class); + if (ret) + goto major_out; - bch_chardev = device_create(bch_chardev_class, NULL, + bch_chardev = 
device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, U8_MAX), NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) - return PTR_ERR(bch_chardev); + if (IS_ERR(bch_chardev)) { + ret = PTR_ERR(bch_chardev); + goto class_out; + } return 0; + +class_out: + class_unregister(&bch_chardev_class); +major_out: + unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); + return ret; } #endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3c761ad6b1c8..23a383577d4c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "errcode.h" +#include "error.h" #include "super.h" #include "super-io.h" @@ -10,6 +11,7 @@ #include <linux/xxhash.h> #include <linux/key.h> #include <linux/random.h> +#include <linux/ratelimit.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> #include <crypto/chacha.h> @@ -99,12 +101,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - int ret; skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); - ret = crypto_skcipher_encrypt(req); + int ret = crypto_skcipher_encrypt(req); if (ret) pr_err("got error %i from crypto_skcipher_encrypt()", ret); @@ -116,38 +118,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, void *buf, size_t len) { if (!is_vmalloc_addr(buf)) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, - is_vmalloc_addr(buf) - ? vmalloc_to_page(buf) - : virt_to_page(buf), - len, offset_in_page(buf)); + struct scatterlist sg = {}; + + sg_mark_end(&sg); + sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); return do_encrypt_sg(tfm, nonce, &sg, len); } else { - unsigned pages = buf_pages(buf, len); - struct scatterlist *sg; - size_t orig_len = len; - int ret, i; - - sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); - if (!sg) - return -BCH_ERR_ENOMEM_do_encrypt; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; + int ret; - sg_init_table(sg, pages); + darray_init(&sgl); - for (i = 0; i < pages; i++) { + while (len) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); + struct scatterlist sg = { + .page_link = (unsigned long) vmalloc_to_page(buf), + .offset = offset, + .length = min(len, PAGE_SIZE - offset), + }; - sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); - buf += pg_len; - len -= pg_len; + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); + if (ret) + goto err; + + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; + BUG_ON(darray_push(&sgl, sg)); + } + + buf += sg.length; + len -= sg.length; + sgl_len += sg.length; } - ret = do_encrypt_sg(tfm, nonce, sg, orig_len); - kfree(sg); + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); return ret; } } @@ -232,7 +243,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, return ret; } default: - BUG(); + return (struct bch_csum) {}; } } @@ -242,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; + return 
do_encrypt(c->chacha20, nonce, data, len); } @@ -306,7 +321,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, return ret; } default: - BUG(); + return (struct bch_csum) {}; } } @@ -323,36 +338,44 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - struct scatterlist sgl[16], *sg = sgl; - size_t bytes = 0; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; int ret = 0; - if (!bch2_csum_type_is_encryption(type)) - return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; - sg_init_table(sgl, ARRAY_SIZE(sgl)); + darray_init(&sgl); bio_for_each_segment(bv, bio, iter) { - if (sg == sgl + ARRAY_SIZE(sgl)) { - sg_mark_end(sg - 1); - - ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + struct scatterlist sg = { + .page_link = (unsigned long) bv.bv_page, + .offset = bv.bv_offset, + .length = bv.bv_len, + }; + + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); if (ret) - return ret; + goto err; - nonce = nonce_add(nonce, bytes); - bytes = 0; + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; - sg_init_table(sgl, ARRAY_SIZE(sgl)); - sg = sgl; + BUG_ON(darray_push(&sgl, sg)); } - sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); - bytes += bv.bv_len; + sgl_len += sg.length; } - sg_mark_end(sg - 1); - return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); + return ret; } struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -429,15 +452,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo, - bch2_csum_types[crc_old.csum_type], - bch2_csum_types[new_csum_type]); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" + " expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo); + bch2_prt_csum_type(&buf, crc_old.csum_type); + prt_str(&buf, " new type "); + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + WARN_RATELIMIT(1, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } @@ -463,9 +491,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); @@ -488,14 +515,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", 
BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); + prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); + prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); + prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); } const struct bch_sb_field_ops bch_sb_field_ops_crypt = { @@ -558,7 +581,7 @@ got_key: return 0; } -#include "../crypto.h" +#include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) @@ -647,26 +670,26 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { - int ret; - - if (!c->chacha20) - c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - ret = PTR_ERR_OR_ZERO(c->chacha20); + if (c->chacha20) + return 0; + struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); + int ret = PTR_ERR_OR_ZERO(chacha20); if (ret) { bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); return ret; } - if (!c->poly1305) - c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); - ret = PTR_ERR_OR_ZERO(c->poly1305); - + struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); + ret = PTR_ERR_OR_ZERO(poly1305); if (ret) { bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + crypto_free_sync_skcipher(chacha20); return ret; } + c->chacha20 = chacha20; + c->poly1305 = poly1305; return 0; } @@ -761,11 +784,11 @@ err: void bch2_fs_encryption_exit(struct bch_fs *c) { - if (!IS_ERR_OR_NULL(c->poly1305)) + if (c->poly1305) crypto_free_shash(c->poly1305); - if (!IS_ERR_OR_NULL(c->chacha20)) + if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (!IS_ERR_OR_NULL(c->sha256)) + if (c->sha256) crypto_free_shash(c->sha256); } @@ -778,6 +801,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) c->sha256 = crypto_alloc_shash("sha256", 0, 0); ret = PTR_ERR_OR_ZERO(c->sha256); if (ret) { + c->sha256 = NULL; bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); goto out; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1b8c2c1016dc..43b9d71f2f2b 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out, struct bch_csum expected, struct bch_csum got) { - prt_printf(out, "checksum error: got "); + prt_str(out, "checksum error, type "); + bch2_prt_csum_type(out, type); + prt_str(out, ": got "); bch2_csum_to_text(out, type, got); prt_str(out, " should be "); bch2_csum_to_text(out, type, expected); - prt_printf(out, " type %s", bch2_csum_types[type]); } int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); @@ -108,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, bool data) { switch (type) { diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 363644451106..1f8e035d7119 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -6,44 +6,45 @@ #include <linux/kthread.h> #include <linux/preempt.h> -static inline long io_timer_cmp(io_timer_heap *h, - struct io_timer *l, - struct io_timer *r) +static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) { - return l->expire - r->expire; + struct io_timer **_l = (struct io_timer **)l; + struct io_timer **_r = (struct io_timer **)r; + 
+ return (*_l)->expire < (*_r)->expire; } +static const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = NULL, +}; + void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic64_read(&clock->now), - timer->expire)) { + if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) goto out; - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); + BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); out: spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp, NULL); + min_heap_del(&clock->timers, i, &callbacks, NULL); break; } @@ -75,33 +76,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer) wake_up_process(wait->task); } -void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = (void *) _RET_IP_, + .task = current, + }; - /* XXX: calculate sleep time rigorously */ - wait.io_timer.expire = until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, - unsigned long io_until, - unsigned long cpu_timeout) + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = io_until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = (void *) _RET_IP_, + .task = current, + }; - wait.io_timer.expire = io_until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); @@ -127,44 +126,44 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_del(clock, &wait.io_timer); } -static struct io_timer *get_expired_timer(struct io_clock *clock, - unsigned long now) +static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; - spin_lock(&clock->timer_lock); - - if (clock->timers.used && - time_after_eq(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp, NULL); - - spin_unlock(&clock->timer_lock); + if (clock->timers.nr && + time_after_eq64(now, clock->timers.data[0]->expire)) { + ret = *min_heap_peek(&clock->timers); + min_heap_pop(&clock->timers, &callbacks, NULL); + } return ret; } -void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) +void __bch2_increment_clock(struct io_clock *clock, u64 sectors) { struct io_timer *timer; - unsigned long now = atomic64_add_return(sectors, &clock->now); + u64 now = atomic64_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) 
timer->fn(timer); + spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - unsigned long now; - unsigned i; - out->atomic++; spin_lock(&clock->timer_lock); - now = atomic64_read(&clock->now); + u64 now = atomic64_read(&clock->now); + + printbuf_tabstop_push(out, 40); + prt_printf(out, "current time:\t%llu\n", now); - for (i = 0; i < clock->timers.used; i++) - prt_printf(out, "%ps:\t%li\n", + for (unsigned i = 0; i < clock->timers.nr; i++) + prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, - clock->timers.data[i]->expire - now); + clock->timers.data[i]->fn2, + clock->timers.data[i]->expire); spin_unlock(&clock->timer_lock); --out->atomic; } diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 70a0f7436c84..82c79c8baf92 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,12 +4,11 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); -void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, - unsigned long); +void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); -void __bch2_increment_clock(struct io_clock *, unsigned); +void __bch2_increment_clock(struct io_clock *, u64); -static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, +static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, int rw) { struct io_clock *clock = &c->io_clock[rw]; @@ -19,16 +18,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } -void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); - -#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ -}) +void bch2_io_clock_schedule_timeout(struct io_clock *, u64); void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 5fae0012d808..37554e4514fe 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -17,13 +17,14 @@ typedef void (*io_timer_fn)(struct io_timer *); struct io_timer { io_timer_fn fn; - unsigned long expire; + void *fn2; + u64 expire; }; /* Amount to buffer up on a percpu counter */ #define IO_CLOCK_PCPU_SECTORS 128 -typedef HEAP(struct io_timer *) io_timer_heap; +typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap; struct io_clock { atomic64_t now; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 33df8cf86bd8..114bf2f3879f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -2,13 +2,34 @@ #include "bcachefs.h" #include "checksum.h" #include "compress.h" +#include "error.h" #include "extents.h" +#include "io_write.h" +#include "opts.h" #include "super-io.h" #include <linux/lz4.h> #include <linux/zlib.h> #include <linux/zstd.h> +static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) +{ + switch (type) { + case BCH_COMPRESSION_TYPE_none: + case BCH_COMPRESSION_TYPE_incompressible: + return BCH_COMPRESSION_OPT_none; + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + return BCH_COMPRESSION_OPT_lz4; + case BCH_COMPRESSION_TYPE_gzip: + return BCH_COMPRESSION_OPT_gzip; + case BCH_COMPRESSION_TYPE_zstd: + return BCH_COMPRESSION_OPT_zstd; + 
default: + BUG(); + } +} + /* Bounce buffer: */ struct bbuf { void *b; @@ -158,6 +179,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *workspace; int ret; + enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); + mempool_t *workspace_pool = &c->compress_workspace[opt]; + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_type_not_marked_in_sb, + "compression type %s set but not marked in superblock", + __bch2_compression_types[crc.compression_type])) + ret = bch2_check_set_has_compressed_data(c, opt); + else + ret = -BCH_ERR_compression_workspace_not_initialized; + if (ret) + goto out; + } + src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { @@ -176,13 +210,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); ret = zlib_inflate(&strm, Z_FINISH); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != Z_STREAM_END) goto err; @@ -195,14 +229,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != dst_len) goto err; @@ -212,6 +246,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, BUG(); } ret = 0; +fsck_err: out: bio_unmap_or_unbounce(c, src_data); return ret; @@ -220,11 +255,14 @@ err: goto out; } -int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - struct bch_extent_crc_unpacked *crc) +int bch2_bio_uncompress_inplace(struct bch_write_op *op, + struct bio *bio) { + struct bch_fs *c = op->c; + struct bch_extent_crc_unpacked *crc = &op->crc; struct bbuf data = { NULL }; size_t dst_len = crc->uncompressed_size << 9; + int ret = 0; /* bio must own its pages: */ BUG_ON(!bio->bi_vcnt); @@ -232,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch_err(c, "error rewriting existing data: extent too big"); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: extent too big"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch_err(c, "error rewriting existing data: decompression error"); - bio_unmap_or_unbounce(c, data); - return -EIO; + if (!c->opts.no_data_io) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: decompression error"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; + goto err; } /* @@ -259,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, crc->uncompressed_size = crc->live_size; crc->offset = 0; crc->csum = (struct bch_csum) { 0, 0 }; - +err: 
bio_unmap_or_unbounce(c, data); - return 0; + return ret; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, @@ -394,8 +441,21 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); - BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + /* bch2_compression_decode catches unknown compression types: */ + BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); + + mempool_t *workspace_pool = &c->compress_workspace[compression.type]; + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_opt_not_marked_in_sb, + "compression opt %s set but not marked in superblock", + bch2_compression_opts[compression.type])) { + ret = bch2_check_set_has_compressed_data(c, compression.type); + if (ret) /* memory allocation failure, don't compress */ + return 0; + } else { + return 0; + } + } /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -404,7 +464,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; @@ -447,7 +507,7 @@ static unsigned __bio_compress(struct bch_fs *c, *src_len = round_down(*src_len, block_bytes(c)); } - mempool_free(workspace, &c->compress_workspace[compression_type]); + mempool_free(workspace, workspace_pool); if (ret) goto err; @@ -477,6 +537,9 @@ out: err: ret = BCH_COMPRESSION_TYPE_incompressible; goto out; +fsck_err: + ret = 0; + goto out; } unsigned bch2_bio_compress(struct bch_fs *c, @@ -559,7 +622,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) { unsigned i; - mempool_exit(&c->decompress_workspace); for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) mempool_exit(&c->compress_workspace[i]); mempool_exit(&c->compression_bounce[WRITE]); @@ -568,7 +630,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); @@ -576,19 +637,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) struct { unsigned feature; - enum bch_compression_type type; + enum bch_compression_opts type; size_t compress_workspace; - size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), - 0 }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, - zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize(), }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - c->zstd_workspace_size, - zstd_dctx_workspace_bound() }, + { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, + max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize()) }, + { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, + max(c->zstd_workspace_size, + zstd_dctx_workspace_bound()) }, }, *i; bool have_compressed = false; @@ -601,38 +660,30 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if (!mempool_initialized(&c->compression_bounce[READ]) && - 
mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) { - decompress_workspace_size = - max(decompress_workspace_size, i->decompress_workspace); - if (!(features & (1 << i->feature))) continue; if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) - return -BCH_ERR_ENOMEM_decompression_workspace_init; - return 0; } diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 58c2eb45570f..bec2f05bfd52 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,16 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) -{ - if (type < BCH_COMPRESSION_TYPE_NR) - prt_str(out, __bch2_compression_types[type]); - else - prt_printf(out, "(invalid compression type %u)", type); -} - -int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - struct bch_extent_crc_unpacked *); +struct bch_write_op; +int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, struct bvec_iter, struct bch_extent_crc_unpacked); unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index ac35b8b705ae..e86d36d23e9e 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -2,18 +2,32 @@ #include <linux/log2.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include "darray.h" -int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); - void *data = kvmalloc_array(new_size, element_size, gfp); + /* + * This is a workaround: kvmalloc() doesn't support > INT_MAX + * allocations, but vmalloc() does. + * The limit needs to be lifted from kvmalloc, and when it does + * we'll go back to just using that. + */ + size_t bytes; + if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) + return -ENOMEM; + + void *data = likely(bytes < INT_MAX) + ? 
kvmalloc_noprof(bytes, gfp) + : vmalloc_noprof(bytes); if (!data) return -ENOMEM; - memcpy(data, d->data, d->size * element_size); + if (d->size) + memcpy(data, d->data, d->size * element_size); if (d->data != d->preallocated) kvfree(d->data); d->data = data; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 4b340d13caac..c6151495985f 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -22,29 +22,23 @@ struct { \ typedef DARRAY(char) darray_char; typedef DARRAY(char *) darray_str; -int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); - -static inline int __darray_resize(darray_char *d, size_t element_size, - size_t new_size, gfp_t gfp) -{ - return unlikely(new_size > d->size) - ? __bch2_darray_resize(d, element_size, new_size, gfp) - : 0; -} +int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); + +#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) + +#define __darray_resize(_d, _element_size, _new_size, _gfp) \ + (unlikely((_new_size) > (_d)->size) \ + ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ + : 0) #define darray_resize_gfp(_d, _new_size, _gfp) \ - unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)) + __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) #define darray_resize(_d, _new_size) \ darray_resize_gfp(_d, _new_size, GFP_KERNEL) -static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp) -{ - return __darray_resize(d, t_size, d->nr + more, gfp); -} - #define darray_make_room_gfp(_d, _more, _gfp) \ - __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) @@ -89,7 +83,7 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 4150feca42a2..642fbc60ecab 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -5,7 +5,9 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "data_update.h" +#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -14,20 +16,105 @@ #include "move.h" #include "nocow_locking.h" #include "rebalance.h" +#include "snapshot.h" #include "subvolume.h" #include "trace.h" -static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { - if (trace_move_extent_finish_enabled()) { - struct printbuf buf = PRINTBUF; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); +} + +static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return false; + } + } + return true; +} - 
bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_finish(c, buf.buf); - printbuf_exit(&buf); +static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); } } +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + ca = bch2_dev_have_ref(c, ptr2->dev); + bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } + } + } + return true; +} + +static noinline void trace_move_extent_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) +{ + struct bch_fs *c = u->op.c; + struct printbuf buf = PRINTBUF; + + prt_newline(&buf); + + bch2_data_update_to_text(&buf, u); + prt_newline(&buf); + + prt_str_indented(&buf, "new replicas:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + prt_newline(&buf); + + prt_str_indented(&buf, "insert:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); +} + static void trace_move_extent_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, @@ -36,11 +123,8 @@ static void trace_move_extent_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; struct printbuf buf = PRINTBUF; - unsigned i, rewrites_found = 0; + unsigned rewrites_found = 0; if (!trace_move_extent_fail_enabled()) return; @@ -48,27 +132,25 @@ static void trace_move_extent_fail2(struct data_update *m, prt_str(&buf, msg); if (insert) { - i = 0; + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) - rewrites_found |= 1U << i; - i++; + rewrites_found |= ptr_bit; + ptr_bit <<= 1; } } - prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", - (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + prt_str(&buf, "rewrites found:\t"); + bch2_prt_u64_base2(&buf, rewrites_found); + prt_newline(&buf); - prt_printf(&buf, "\nrewrites found: %u%u%u%u", - (rewrites_found & (1 << 0)) != 0, - (rewrites_found & (1 << 1)) != 0, - (rewrites_found & (1 << 2)) != 0, - (rewrites_found & (1 << 3)) != 0); + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); prt_str(&buf, "\nold: "); 
bch2_bkey_val_to_text(&buf, c, old); @@ -105,7 +187,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (1) { struct bkey_s_c k; @@ -120,7 +202,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct bpos next_pos; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, i; + unsigned rewrites_found = 0, durability, ptr_bit; bch2_trans_begin(trans); @@ -157,15 +239,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * * Fist, drop rewrite_ptrs from @new: */ - i = 0; + ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); - rewrites_found |= 1U << i; + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), ptr); + rewrites_found |= ptr_bit; } - i++; + ptr_bit <<= 1; } if (m->data_opts.rewrite_ptrs && @@ -201,6 +284,7 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ + rcu_read_lock(); restart_drop_extra_replicas: bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); @@ -209,17 +293,19 @@ restart_drop_extra_replicas: durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); goto restart_drop_extra_replicas; } } + rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(trans, &iter, insert, &should_check_enospc, @@ -245,14 +331,16 @@ restart_drop_extra_replicas: * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - struct printbuf err = PRINTBUF; - int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); - printbuf_exit(&err); - + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), + (struct bkey_validate_context) { + .btree = m->btree_id, + .flags = BCH_VALIDATE_commit, + }); if (invalid) { struct printbuf buf = PRINTBUF; prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); prt_str(&buf, "\nk: "); @@ -264,6 +352,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } @@ -285,9 +374,9 @@ restart_drop_extra_replicas: k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, - 
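/*
 * Illustrative sketch (not bcachefs code, not part of this diff): the two
 * bitmask idioms this file now uses for rewrite_ptrs/kill_ptrs.  Walking
 * pointers carries the bit itself (ptr_bit, shifted once per element)
 * instead of an index that has to be turned into 1U << i, and draining a
 * mask pops its top bit each pass, the way the kill_ptrs loop in this file
 * does with __fls().  Assumes 32-bit unsigned int for the clz arithmetic.
 */
#include <stdio.h>

static void walk_selected(const int *v, unsigned nr, unsigned mask)
{
	unsigned ptr_bit = 1;

	for (unsigned i = 0; i < nr; i++, ptr_bit <<= 1)
		if (mask & ptr_bit)
			printf("element %u (%d) selected\n", i, v[i]);
}

static void drain_mask(unsigned mask)
{
	while (mask) {
		unsigned drop = 31 - __builtin_clz(mask);	/* highest set bit, like __fls() */

		printf("dropping entry %u\n", drop);
		mask ^= 1U << drop;
	}
}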
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -297,7 +386,8 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + if (trace_move_extent_finish_enabled()) + trace_move_extent_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -352,16 +442,11 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; - struct bkey_ptrs_c ptrs = - bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - - bkey_for_each_ptr(ptrs, ptr) { - if (c->opts.nocow_enabled) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); - } + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); @@ -385,8 +470,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, while (bio_sectors(bio)) { unsigned sectors = bio_sectors(bio); + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_SLOTS); + BTREE_ITER_slots); ret = lockrestart_do(trans, ({ k = bch2_btree_iter_peek_slot(&iter); bkey_err(k); @@ -448,10 +535,50 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, } } +void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + + prt_str_indented(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str_indented(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str_indented(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str_indented(out, "compression:\t"); + bch2_compression_opt_to_text(out, io_opts->background_compression); + prt_newline(out); + + prt_str_indented(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); + prt_newline(out); + + prt_str_indented(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); +} + +void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_newline(out); + + prt_str_indented(out, "old key:\t"); + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct data_update_opts data_opts) + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -462,12 +589,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, if (ret) return ret; - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; + while (data_opts->kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts->kill_ptrs); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == 
drop); + data_opts->kill_ptrs ^= 1U << drop; } /* @@ -475,19 +601,19 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(n)); + bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) n->k.size = 0; return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -505,10 +631,26 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned ptrs_locked = 0; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; + /* + * fs is corrupt we have a key for a snapshot node that doesn't exist, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) + return -BCH_ERR_data_update_done; + + if (!bkey_get_dev_refs(c, k)) + return -BCH_ERR_data_update_done; + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + bkey_put_dev_refs(c, k); + return -BCH_ERR_nocow_lock_blocked; + } + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -518,7 +660,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->version; + m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; @@ -527,30 +669,26 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = background_compression(io_opts); + m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) - percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); - unsigned durability_have = 0, durability_removing = 0; - i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - bool locked; - - if (((1U << i) & m->data_opts.rewrite_ptrs)) { - BUG_ON(p.ptr.cached); - - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached && - !((1U << i) & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); + if (!p.ptr.cached) { + rcu_read_lock(); + if (ptr_bit & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += 
bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + rcu_read_unlock(); } /* @@ -566,30 +704,11 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (c->opts.nocow_enabled) { - if (ctxt) { - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) || - (!atomic_read(&ctxt->read_sectors) && - !atomic_read(&ctxt->write_sectors))); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; - } - } - ptrs_locked |= (1U << i); - } - - i++; + ptr_bit <<= 1; } + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + /* * If current extent durability is less than io_opts.data_replicas, * we're not trying to rereplicate the extent up to data_replicas here - @@ -598,51 +717,49 @@ int bch2_data_update_init(struct btree_trans *trans, * Increasing replication is an explicit operation triggered by * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ - if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - durability_have >= io_opts.data_replicas) { + m->op.nr_replicas = min(durability_removing, durability_required) + + m->data_opts.extra_replicas; + + /* + * If device(s) were set to durability=0 after data was written to them + * we can end up with a duribilty=0 extent, and the normal algorithm + * that tries not to increase durability doesn't work: + */ + if (!(durability_have + durability_removing)) + m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + + m->op.nr_replicas_required = m->op.nr_replicas; + + /* + * It might turn out that we don't need any new replicas, if the + * replicas or durability settings have been changed since the extent + * was written: + */ + if (!m->op.nr_replicas) { m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto done; + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); + goto out; } - m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + - m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - - BUG_ON(!m->op.nr_replicas); - if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 
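/*
 * Illustrative sketch (not bcachefs code, not part of this diff): the
 * replica-count arithmetic introduced above.  A plain rewrite only writes
 * back as much durability as it is removing, capped by what the
 * data_replicas option still requires, so moving data never silently
 * increases replication; a durability-0 extent still gets at least one new
 * copy.  A nr_replicas of 0 corresponds to the "drop instead of rewrite"
 * path above.
 */
#include <assert.h>

static unsigned new_replicas(unsigned data_replicas,
			     unsigned durability_have,
			     unsigned durability_removing,
			     unsigned extra_replicas)
{
	unsigned required = data_replicas > durability_have
		? data_replicas - durability_have : 0;
	unsigned nr = (durability_removing < required
		       ? durability_removing : required) + extra_replicas;

	/* all-zero-durability extents still need at least one copy written */
	if (!(durability_have + durability_removing) && nr < 1)
		nr = 1;
	return nr;
}

int main(void)
{
	/* keeping one durability-1 replica, rewriting the other: write 1 new copy */
	assert(new_replicas(2, 1, 1, 0) == 1);
	/* replicas option lowered to 1 and we already have 1: nothing new to write */
	assert(new_replicas(1, 1, 1, 0) == 0);
	return 0;
}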
0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto err; + goto out; } if (bkey_extent_is_unwritten(k)) { bch2_update_unwritten_extent(trans, m); - goto done; + goto out; } return 0; -err: - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); - i++; - } - - bch2_bkey_buf_exit(&m->k, c); - bch2_bio_free_pages_pool(c, &m->op.wbio.bio); - return ret; -done: +out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; } @@ -650,14 +767,14 @@ done: void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { - opts->kill_ptrs |= 1U << i; - opts->rewrite_ptrs ^= 1U << i; + if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { + opts->kill_ptrs |= ptr_bit; + opts->rewrite_ptrs ^= ptr_bit; } - i++; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 991095bbd469..e4b50723428e 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -17,6 +17,9 @@ struct data_update_opts { unsigned write_flags; }; +void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_opts *, struct data_update_opts *); + struct data_update { /* extent being updated: */ enum btree_id btree_id; @@ -27,6 +30,8 @@ struct data_update { struct bch_write_op op; }; +void bch2_data_update_to_text(struct printbuf *, struct data_update *); + int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, @@ -35,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *, int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c, - struct data_update_opts); + struct bch_io_opts *, + struct data_update_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bdba8507fc9..55333e82d1fe 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -13,12 +13,14 @@ #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" #include "extents.h" #include "fsck.h" #include "inode.h" +#include "journal_reclaim.h" #include "super.h" #include <linux/console.h> @@ -36,11 +38,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_ondisk = c->verify_ondisk; struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; bool failed = false, saw_error = false; - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, @@ -137,7 +139,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -170,7 +172,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct printbuf 
buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); printbuf_exit(&buf); } out: @@ -193,13 +195,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!bch2_dev_get_ioref(ca, READ)) { + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +295,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } @@ -374,8 +376,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); bch2_trans_unlock(trans); @@ -396,47 +398,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); + ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - trans = bch2_trans_get(i->c); -retry: - bch2_trans_begin(trans); - - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? bpos_successor(b->key.k.p) - : b->key.k.p; - - ret = drop_locks_do(trans, flush_buf(i)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + return bch2_trans_run(i->c, + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) + : b->key.k.p; - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + drop_locks_do(trans, flush_buf(i)); + }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { @@ -458,8 +440,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ struct btree_path_level *l = &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = @@ -491,51 +473,28 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u ", - b, - bch2_btree_id_str(b->c.btree_id), - b->c.level); - prt_newline(out); + prt_printf(out, "%px ", b); + bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); + prt_printf(out, "\n"); printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); - prt_printf(out, "flags: "); - prt_tab(out); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_btree_node_flags, b->flags); prt_newline(out); - prt_printf(out, "pcpu read locks: "); - prt_tab(out); - prt_printf(out, "%u", b->c.lock.readers != NULL); - prt_newline(out); - - prt_printf(out, "written:"); - prt_tab(out); - prt_printf(out, "%u", b->written); - prt_newline(out); - - prt_printf(out, "writes blocked:"); - prt_tab(out); - prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); - prt_newline(out); - - prt_printf(out, "will make reachable:"); - prt_tab(out); - prt_printf(out, "%lx", b->will_make_reachable); - prt_newline(out); - - prt_printf(out, "journal pin %px:", &b->writes[0].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[0].journal.seq); - prt_newline(out); + prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); + prt_printf(out, "written:\t%u\n", b->written); + prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); + prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - prt_printf(out, "journal pin %px:", &b->writes[1].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[1].journal.seq); - prt_newline(out); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[0].journal, b->writes[0].journal.seq); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[1].journal, b->writes[1].journal.seq); printbuf_indent_sub(out, 2); } @@ -592,6 +551,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -599,42 +584,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; 
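/*
 * Illustrative sketch (not bcachefs code, not part of this diff): why the
 * debugfs readers above now sort btree_trans_list by pointer value.  The
 * read loop has to drop the lock to flush output to userspace, so it keeps
 * the last object it emitted as a cursor and, after relocking (when the
 * list may have changed), resumes at the first entry strictly greater than
 * that cursor.  Any stable total order works; the object's address is just
 * a cheap one.  Integer keys stand in for pointers here.
 */
#include <stdio.h>

struct ent {
	unsigned long	key;	/* stands in for the trans pointer */
	const char	*name;
};

static void emit_from_cursor(const struct ent *sorted, unsigned nr,
			     unsigned long *cursor)
{
	for (unsigned i = 0; i < nr; i++) {
		if (sorted[i].key <= *cursor)
			continue;		/* already emitted on an earlier pass */
		*cursor = sorted[i].key;
		printf("%s\n", sorted[i].name);
		/* a real reader would drop/retake the lock here and may restart */
	}
}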
ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= i->iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + i->iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + if (!closure_get_not_zero(&trans->ref)) + continue; + + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); - prt_printf(&i->buf, "backtrace:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -668,7 +650,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - do { + while (1) { err = flush_buf(i); if (err) return err; @@ -676,9 +658,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, if (!i->size) break; + if (done) + break; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; - } while (!done); + } if (i->buf.allocation_failure) return -ENOMEM; @@ -693,13 +678,45 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + bch2_btree_updates_to_text(&i->buf, c); + i->iter++; + } + + err = flush_buf(i); + if (err) + return err; + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_updates_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_updates_read, +}; + static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) return -ENOMEM; @@ -746,25 +763,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, !bch2_btree_transaction_fns[i->iter]) break; - prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); - prt_newline(&i->buf); + prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); mutex_lock(&s->lock); - prt_printf(&i->buf, "Max mem used: %u", s->max_mem); - prt_newline(&i->buf); - - prt_printf(&i->buf, "Transaction duration:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->duration); printbuf_indent_sub(&i->buf, 2); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock 
hold times:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Lock hold times:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); @@ -772,8 +784,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, } if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); - prt_newline(&i->buf); + prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); printbuf_indent_add(&i->buf, 2); prt_str_indented(&i->buf, s->max_paths_text); @@ -800,50 +811,55 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - u32 seq; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + if (!closure_get_not_zero(&trans->ref)) + continue; - bch2_check_for_deadlock(trans, &i->buf); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; @@ -866,6 +882,20 @@ void bch2_fs_debug_exit(struct bch_fs *c) debugfs_remove_recursive(c->fs_debug_dir); } +static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) +{ + struct dentry *d; + + d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); + + debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); + + debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); + + debugfs_create_file("bfloat-failed", 0400, d, bd, + &bfloat_failed_debug_ops); +} + void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; @@ -888,6 +918,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, + c->btree_debug, &btree_updates_ops); + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); @@ -902,21 +935,7 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + 
ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_id_str(bd->id), - 0400, c->btree_debug_dir, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &bfloat_failed_debug_ops); + bch2_fs_debug_btree_init(c, bd); } } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4ae1e9f002a0..600eee936f13 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -15,6 +15,9 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { + if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) + return 0; + unsigned bkey_u64s = bkey_val_u64s(d.k); unsigned bkey_bytes = bkey_u64s * sizeof(u64); u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; @@ -97,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); int ret = 0; - bkey_fsck_err_on(!d_name.len, c, err, - dirent_empty_name, + bkey_fsck_err_on(!d_name.len, + c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, - dirent_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + c, dirent_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); @@ -118,45 +120,47 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, - dirent_name_too_long, + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, - dirent_name_embedded_nul, + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), + c, dirent_name_embedded_nul, "dirent has stray data after name's NUL"); bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, - dirent_name_dot_or_dotdot, + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), + c, dirent_name_dot_or_dotdot, "invalid name"); - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, - dirent_name_has_slash, + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), + c, dirent_name_has_slash, "name with /"); bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, - dirent_to_itself, + le64_to_cpu(d.v->d_inum) == d.k->p.inode, + c, dirent_to_itself, "dirent points to own directory"); fsck_err: return ret; } -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> %llu type %s", - d_name.len, - d_name.name, - d.v->d_type != DT_SUBVOL - ? le64_to_cpu(d.v->d_inum) - : le32_to_cpu(d.v->d_child_subvol), - bch2_d_type_str(d.v->d_type)); + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + + if (d.v->d_type != DT_SUBVOL) + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + else + prt_printf(out, "%u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, @@ -199,17 +203,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -217,10 +221,9 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, snapshot, &dirent->k_i, + flags|BTREE_UPDATE_internal_snapshot_node); *dir_offset = dirent->k.p.offset; return ret; @@ -230,7 +233,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t 
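/*
 * Illustrative sketch (not bcachefs code, not part of this diff): the
 * directory-entry name rules the validate function above enforces, as one
 * standalone check.  NAME_MAX_SKETCH stands in for BCH_NAME_MAX and is only
 * illustrative; the real code also rejects dirents pointing at their own
 * directory, which needs key context and is omitted here.
 */
#include <stdbool.h>
#include <string.h>

#define NAME_MAX_SKETCH 512

static bool dirent_name_ok(const char *name, size_t len)
{
	if (!len)				/* empty name */
		return false;
	if (len > NAME_MAX_SKETCH)		/* too long (checked for new keys above) */
		return false;
	if (strnlen(name, len) != len)		/* NUL inside the recorded length */
		return false;
	if ((len == 1 && name[0] == '.') ||
	    (len == 2 && name[0] == '.' && name[1] == '.'))
		return false;			/* "." and ".." are reserved */
	if (memchr(name, '/', len))		/* path separator inside a name */
		return false;
	return true;
}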
str_hash_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; @@ -241,19 +244,12 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, str_hash_flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; } -static void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, struct bkey_s_c_dirent d, subvol_inum *target) { @@ -270,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, &s); target->inum = le64_to_cpu(s.inode); } @@ -291,23 +287,17 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ - ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_src = bch2_btree_iter_peek_slot(&src_iter); + old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) goto out; @@ -317,12 +307,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -335,13 +319,9 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; } else { - ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_dst = bch2_btree_iter_peek_slot(&dst_iter); + old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) goto out; @@ -350,11 +330,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -424,28 +399,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * 
single dirent that points to a given subvolume. + * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. */ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; @@ -456,41 +458,25 @@ out: return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - return ret; - - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + int ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -502,23 +488,26 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + for_each_btree_key_max_norestart(trans, iter, 
BTREE_ID_dirents, SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { - ret = -ENOTEMPTY; + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); @@ -531,73 +520,55 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) u32 snapshot; return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); +} + +static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) +{ + struct qstr name = bch2_dirent_get_name(d); + /* + * Although not required by the kernel code, updating ctx->pos is needed + * for the bcachefs FUSE driver. Without this update, the FUSE + * implementation will be stuck in an infinite loop when reading + * directories (via the bcachefs_fuse_readdir callback). + * In kernel space, ctx->pos is updated by the VFS code. + */ + ctx->pos = d.k->p.offset; + bool ret = dir_emit(ctx, name.name, + name.len, + target.inum, + vfs_d_type(d.v->d_type)); + if (ret) + ctx->pos = d.k->p.offset + 1; + return ret; } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent dirent; - subvol_inum target; - u32 snapshot; struct bkey_buf sk; - struct qstr name; - int ret; - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), - POS(inum.inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - dirent = bkey_s_c_to_dirent(k); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_dirent) + continue; - ret = bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret < 0) - break; - if (ret) - continue; + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(trans); + subvol_inum target; + int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret2 > 0) + continue; - name = bch2_dirent_get_name(dirent); + ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + }))); - ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(dirent.v->d_type))) - break; - ctx->pos = dirent.k->p.offset + 1; - - /* - * read_target looks up subvolumes, we can overflow paths if the - * directory has many subvolumes in it - */ - ret = btree_trans_too_many_iters(trans); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); - return ret; + return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 21ffeb78f02e..362b3b2f2f2e 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,15 +4,14 @@ #include "str_hash.h" -enum bkey_invalid_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_invalid = bch2_dirent_invalid, \ + .key_validate = bch2_dirent_validate, \ .val_to_text = bch2_dirent_to_text, \ .min_val_size = 16, \ }) @@ -35,14 +34,21 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +static inline void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) { @@ -62,14 +68,14 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c new file mode 100644 index 000000000000..b32e91ba8be8 --- /dev/null +++ b/fs/bcachefs/disk_accounting.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bcachefs_ioctl.h" +#include "btree_cache.h" +#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "compress.h" +#include "disk_accounting.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" + +/* + * Notes on disk accounting: + * + * We have two parallel sets of counters to be concerned with, and both must be + * kept in sync. + * + * - Persistent/on disk accounting, stored in the accounting btree and updated + * via btree write buffer updates that treat new accounting keys as deltas to + * apply to existing values. But reading from a write buffer btree is + * expensive, so we also have + * + * - In memory accounting, where accounting is stored as an array of percpu + * counters, indexed by an eytzinger array of disk acounting keys/bpos (which + * are the same thing, excepting byte swabbing on big endian). 
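/*
 * Illustrative sketch (not bcachefs code, not part of this diff): what an
 * "eytzinger array" is.  The in-memory accounting keys mentioned above are
 * kept in BFS (heap) order -- node i's children live at 2i and 2i+1 -- so
 * the hot top levels of the implicit search tree pack into a few cache
 * lines and a node's two children are adjacent, which makes binary search
 * cache- and prefetch-friendly.  A minimal 1-based search over such an
 * array (index 0 unused, v[1..nr] in eytzinger order), returning the
 * position of the first element >= key or 0 if none; the eytzinger0_*
 * helpers used later in this file are a zero-based variant of the same
 * idea.
 */
#include <stddef.h>

static size_t eytzinger_search_ge(const int *v, size_t nr, int key)
{
	size_t i = 1, found = 0;

	while (i <= nr) {
		if (v[i] >= key) {
			found = i;	/* candidate; look left for a smaller one */
			i = 2 * i;
		} else {
			i = 2 * i + 1;	/* everything left of here is too small */
		}
	}
	return found;
}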
+ * + * Cheap to read, but non persistent. + * + * Disk accounting updates are generated by transactional triggers; these run as + * keys enter and leave the btree, and can compare old and new versions of keys; + * the output of these triggers are deltas to the various counters. + * + * Disk accounting updates are done as btree write buffer updates, where the + * counters in the disk accounting key are deltas that will be applied to the + * counter in the btree when the key is flushed by the write buffer (or journal + * replay). + * + * To do a disk accounting update: + * - initialize a disk_accounting_pos, to specify which counter is being update + * - initialize counter deltas, as an array of 1-3 s64s + * - call bch2_disk_accounting_mod() + * + * This queues up the accounting update to be done at transaction commit time. + * Underneath, it's a normal btree write buffer update. + * + * The transaction commit path is responsible for propagating updates to the in + * memory counters, with bch2_accounting_mem_mod(). + * + * The commit path also assigns every disk accounting update a unique version + * number, based on the journal sequence number and offset within that journal + * buffer; this is used by journal replay to determine which updates have been + * done. + * + * The transaction commit path also ensures that replicas entry accounting + * updates are properly marked in the superblock (so that we know whether we can + * mount without data being unavailable); it will update the superblock if + * bch2_accounting_mem_mod() tells it to. + */ + +static const char * const disk_accounting_type_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + NULL +}; + +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + struct bkey_i_accounting *acc = bkey_accounting_init(k); + + acc->k.p = disk_accounting_pos_to_bpos(pos); + set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + + memcpy_u64s_small(acc->v.d, d, nr); +} + +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + +int bch2_disk_accounting_mod(struct btree_trans *trans, + struct disk_accounting_pos *k, + s64 *d, unsigned nr, bool gc) +{ + /* Normalize: */ + switch (k->type) { + case BCH_DISK_ACCOUNTING_replicas: + bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); + break; + } + + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, k, d, nr); + + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } +} + +int bch2_mod_dev_cached_sectors(struct btree_trans *trans, + unsigned dev, s64 sectors, + bool gc) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + bch2_replicas_entry_cached(&acc.replicas, dev); + + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); +} + +static inline bool is_zero(char *start, char *end) +{ + BUG_ON(start > end); + + for (; start < end; start++) + if (*start) + return false; + return true; +} + +#define field_end(p, member) (((void *) (&p.member)) + 
sizeof(p.member)) + +int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + void *end = &acc_k + 1; + int ret = 0; + + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), + c, accounting_key_version_0, + "accounting key with version=0"); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + end = field_end(acc_k, nr_inodes); + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + end = field_end(acc_k, persistent_reserved); + break; + case BCH_DISK_ACCOUNTING_replicas: + bkey_fsck_err_on(!acc_k.replicas.nr_devs, + c, accounting_key_replicas_nr_devs_0, + "accounting key replicas entry with nr_devs=0"); + + bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || + (acc_k.replicas.nr_required > 1 && + acc_k.replicas.nr_required == acc_k.replicas.nr_devs), + c, accounting_key_replicas_nr_required_bad, + "accounting key replicas entry with bad nr_required"); + + for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) + bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], + c, accounting_key_replicas_devs_unsorted, + "accounting key replicas entry with unsorted devs"); + + end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + end = field_end(acc_k, dev_data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + end = field_end(acc_k, compression); + break; + case BCH_DISK_ACCOUNTING_snapshot: + end = field_end(acc_k, snapshot); + break; + case BCH_DISK_ACCOUNTING_btree: + end = field_end(acc_k, btree); + break; + case BCH_DISK_ACCOUNTING_rebalance_work: + end = field_end(acc_k, rebalance_work); + break; + } + + bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), + c, accounting_key_junk_at_end, + "junk at end of accounting key"); +fsck_err: + return ret; +} + +void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) +{ + if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { + prt_printf(out, "unknown type %u", k->type); + return; + } + + prt_str(out, disk_accounting_type_strs[k->type]); + prt_str(out, " "); + + switch (k->type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); + break; + case BCH_DISK_ACCOUNTING_replicas: + bch2_replicas_entry_to_text(out, &k->replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); + bch2_prt_data_type(out, k->dev_data_type.data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + bch2_prt_compression_type(out, k->compression.type); + break; + case BCH_DISK_ACCOUNTING_snapshot: + prt_printf(out, "id=%u", k->snapshot.id); + break; + case BCH_DISK_ACCOUNTING_btree: + prt_str(out, "btree="); + bch2_btree_id_to_text(out, k->btree.id); + break; + } +} + +void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + bch2_accounting_key_to_text(out, &acc_k); + + for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) + prt_printf(out, " %lli", acc.v->d[i]); +} + +void bch2_accounting_swab(struct bkey_s k) +{ + for (u64 *p = (u64 *) k.v; + p < (u64 *) bkey_val_end(k); + p++) + *p = 
swab64(*p); +} + +static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, + struct disk_accounting_pos *acc) +{ + unsafe_memcpy(r, &acc->replicas, + replicas_entry_bytes(&acc->replicas), + "variable length struct"); +} + +static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_replicas: + __accounting_to_replicas(r, &acc_k); + return true; + default: + return false; + } +} + +static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) +{ + struct bch_replicas_padded r; + return accounting_to_replicas(&r.e, p) + ? bch2_mark_replicas(c, &r.e) + : 0; +} + +/* + * Ensure accounting keys being updated are present in the superblock, when + * applicable (i.e. replicas updates) + */ +int bch2_accounting_update_sb(struct btree_trans *trans) +{ + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { + int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); + if (ret) + return ret; + } + + return 0; +} + +static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_accounting_mem *acc = &c->accounting; + + /* raced with another insert, already present: */ + if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p) < acc->k.nr) + return 0; + + struct accounting_mem_entry n = { + .pos = a.k->p, + .bversion = a.k->bversion, + .nr_counters = bch2_accounting_counters(a.k), + .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL), + }; + + if (!n.v[0]) + goto err; + + if (acc->gc_running) { + n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!n.v[1]) + goto err; + } + + if (darray_push(&acc->k, n)) + goto err; + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } + return 0; +err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); + return -BCH_ERR_ENOMEM_disk_accounting; +} + +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) +{ + struct bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + percpu_up_read(&c->mark_lock); + percpu_down_write(&c->mark_lock); + int ret = __bch2_accounting_mem_insert(c, a); + percpu_up_write(&c->mark_lock); + percpu_down_read(&c->mark_lock); + return ret; +} + +static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) +{ + for (unsigned i = 0; i < e->nr_counters; i++) + if (percpu_u64_get(e->v[0] + i) || + (e->v[1] && + percpu_u64_get(e->v[1] + i))) + return false; + return true; +} + +void bch2_accounting_mem_gc(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + percpu_down_write(&c->mark_lock); + struct accounting_mem_entry *dst = acc->k.data; + + darray_for_each(acc->k, src) { + if (accounting_mem_entry_is_zero(src)) { + free_percpu(src->v[0]); + 
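/*
 * Illustrative sketch (not bcachefs code, not part of this diff): the
 * in-place compaction pattern bch2_accounting_mem_gc() uses here -- walk
 * the array with a separate write cursor, skipping (and freeing) dead
 * entries while sliding live ones down, then shrink the element count to
 * the cursor position and re-sort if ordering matters.
 */
#include <stddef.h>

static size_t compact_nonzero(int *v, size_t nr)
{
	int *dst = v;

	for (size_t i = 0; i < nr; i++)
		if (v[i] != 0)		/* keep: slide down over any holes */
			*dst++ = v[i];

	return dst - v;			/* new element count */
}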
free_percpu(src->v[1]); + } else { + *dst++ = *src; + } + } + + acc->k.nr = dst - acc->k.data; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + percpu_up_write(&c->mark_lock); +} + +/* + * Read out accounting keys for replicas entries, as an array of + * bch_replicas_usage entries. + * + * Note: this may be deprecated/removed at smoe point in the future and replaced + * with something more general, it exists to support the ioctl used by the + * 'bcachefs fs usage' command. + */ +int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + darray_init(usage); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct { + struct bch_replicas_usage r; + u8 pad[BCH_BKEY_PTRS_MAX]; + } u; + + if (!accounting_to_replicas(&u.r.r, i->pos)) + continue; + + u64 sectors; + bch2_accounting_mem_read_counters(acc, i - acc->k.data, §ors, 1, false); + u.r.sectors = sectors; + + ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); + if (ret) + break; + + memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); + usage->nr += replicas_usage_bytes(&u.r); + } + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(usage); + return ret; +} + +int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) +{ + + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + darray_init(out_buf); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct disk_accounting_pos a_p; + bpos_to_disk_accounting_pos(&a_p, i->pos); + + if (!(accounting_types_mask & BIT(a_p.type))) + continue; + + ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + + sizeof(u64) * i->nr_counters); + if (ret) + break; + + struct bkey_i_accounting *a_out = + bkey_accounting_init((void *) &darray_top(*out_buf)); + set_bkey_val_u64s(&a_out->k, i->nr_counters); + a_out->k.p = i->pos; + bch2_accounting_mem_read_counters(acc, i - acc->k.data, + a_out->v.d, i->nr_counters, false); + + if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) + out_buf->nr += bkey_bytes(&a_out->k); + } + + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(out_buf); + return ret; +} + +static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) +{ + darray_for_each(acc->k, e) { + free_percpu(e->v[gc]); + e->v[gc] = NULL; + } +} + +int bch2_gc_accounting_start(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); + ret = -BCH_ERR_ENOMEM_disk_accounting; + break; + } + } + + acc->gc_running = !ret; + percpu_up_write(&c->mark_lock); + + return ret; +} + +int bch2_gc_accounting_done(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct bpos pos = POS_MIN; + int ret = 0; + + percpu_down_write(&c->mark_lock); + while (1) { + unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &pos); + + if (idx >= acc->k.nr) + break; + + struct accounting_mem_entry *e = acc->k.data + idx; + pos = bpos_successor(e->pos); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, e->pos); + + if 
(acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; + u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; + + unsigned nr = e->nr_counters; + bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); + + if (memcmp(dst_v, src_v, nr * sizeof(u64))) { + printbuf_reset(&buf); + prt_str(&buf, "accounting mismatch for "); + bch2_accounting_key_to_text(&buf, &acc_k); + + prt_str(&buf, ": got"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", dst_v[j]); + + prt_str(&buf, " should be"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", src_v[j]); + + for (unsigned j = 0; j < nr; j++) + src_v[j] -= dst_v[j]; + + if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + percpu_up_write(&c->mark_lock); + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + percpu_down_write(&c->mark_lock); + if (ret) + goto err; + + if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, &acc_k, src_v, nr); + bch2_accounting_mem_mod_locked(trans, + bkey_i_to_s_c_accounting(&k_i.k), + BCH_ACCOUNTING_normal); + + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + struct bch_fs_usage_base *src = &trans->fs_usage_delta; + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); + preempt_enable(); + } + } + } + } +err: +fsck_err: + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + + if (k.k->type != KEY_TYPE_accounting) + return 0; + + percpu_down_read(&c->mark_lock); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), + BCH_ACCOUNTING_read); + percpu_up_read(&c->mark_lock); + return ret; +} + +static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + struct disk_accounting_pos acc, + u64 *v, unsigned nr) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0, invalid_dev = -1; + + switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; + __accounting_to_replicas(&r.e, &acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r.e.devs[i])) { + invalid_dev = r.e.devs[i]; + goto invalid_device; + } + + /* + * All replicas entry checks except for invalid device are done + * in bch2_accounting_validate + */ + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_write(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_write(&c->mark_lock); + } + break; + } + + case BCH_DISK_ACCOUNTING_dev_data_type: + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { + invalid_dev = acc.dev_data_type.dev; + goto invalid_device; + } + break; + } + +fsck_err: + printbuf_exit(&buf); + return ret; +invalid_device: + if (fsck_err(trans, 
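In bch2_gc_accounting_done() above, a mismatch is repaired by committing the difference between the counters gc recomputed and the current ones (src_v[j] -= dst_v[j]), because accounting updates are deltas, not absolute values. A small standalone illustration of that arithmetic:

#include <stdint.h>
#include <stdio.h>

/* To make the stored counters equal the recomputed values, commit
 * (correct - current); applying that delta lands on the correct value. */
static void repair_counters(int64_t *current, const int64_t *correct,
			    int64_t *delta_out, unsigned nr)
{
	for (unsigned i = 0; i < nr; i++) {
		delta_out[i] = correct[i] - current[i];
		current[i]  += delta_out[i];	/* what applying the delta does */
	}
}

int main(void)
{
	int64_t cur[3]	   = { 100, 7, 0 };	/* counters currently stored */
	int64_t from_gc[3] = {  90, 7, 3 };	/* counters gc recomputed */
	int64_t delta[3];

	repair_counters(cur, from_gc, delta, 3);
	printf("delta: %lld %lld %lld\n",
	       (long long) delta[0], (long long) delta[1], (long long) delta[2]);
	return 0;
}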
accounting_to_invalid_device, + "accounting entry points to invalid device %i\n %s", + invalid_dev, + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { + ret = -BCH_ERR_remove_disk_accounting_entry; + } + goto fsck_err; +} + +/* + * At startup time, initialize the in memory accounting from the btree (and + * journal) + */ +int bch2_accounting_read(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. + * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } + + accounting_read_key(trans, k); + })); + if (ret) + goto err; + + struct journal_keys *keys = &c->journal_keys; + struct journal_key *dst = keys->data; + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + if (i->k->k.type == KEY_TYPE_accounting) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + + struct bkey_s_c k = bkey_i_to_s_c(i->k); + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, + sizeof(acc->k.data[0]), + accounting_pos_cmp, &k.k->p); + + bool applied = idx < acc->k.nr && + bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; + + if (applied) + continue; + + if (i + 1 < &darray_top(*keys) && + i[1].k->k.type == KEY_TYPE_accounting && + !journal_key_cmp(i, i + 1)) { + WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); + + i[1].journal_seq = i[0].journal_seq; + + bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), + bkey_s_c_to_accounting(k)); + continue; + } + + ret = accounting_read_key(trans, k); + if (ret) + goto err; + } + + *dst++ = *i; + } + keys->gap = keys->nr = dst - keys->data; + + percpu_down_write(&c->mark_lock); + + darray_for_each_reverse(acc->k, i) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->pos); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); + + /* + 
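The journal-key loop above skips a journalled accounting delta when the in-memory entry already carries a version >= the key's version, meaning the delta was applied before shutdown. A toy model of that decision, shrunk to a 64-bit version for brevity (the real bversion is wider):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_acct {
	uint64_t version;	/* highest update version already folded in */
	int64_t  counter;
};

/* Apply a journalled delta only if it is strictly newer than anything the
 * in-memory key has already seen; otherwise it is a stale duplicate. */
static bool maybe_replay(struct toy_acct *mem, uint64_t version, int64_t delta)
{
	if (version <= mem->version)
		return false;		/* already applied, skip */

	mem->counter += delta;
	mem->version  = version;
	return true;
}

int main(void)
{
	struct toy_acct a = { .version = 10, .counter = 100 };

	printf("replayed: %d\n", maybe_replay(&a, 9,  5));	/* old, skipped */
	printf("replayed: %d\n", maybe_replay(&a, 11, 5));	/* new, applied */
	printf("counter:  %lld\n", (long long) a.counter);	/* 105 */
	return 0;
}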
* If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: + */ + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + ? -BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); + ret = 0; + continue; + } + + if (ret) + goto fsck_err; + } + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + + preempt_disable(); + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + + for (unsigned i = 0; i < acc->k.nr; i++) { + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + + switch (k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + usage->reserved += v[0] * k.persistent_reserved.nr_replicas; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); + if (ca) { + struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; + percpu_u64_set(&d->buckets, v[0]); + percpu_u64_set(&d->sectors, v[1]); + percpu_u64_set(&d->fragmented, v[2]); + + if (k.dev_data_type.data_type == BCH_DATA_sb || + k.dev_data_type.data_type == BCH_DATA_journal) + usage->hidden += v[0] * ca->mi.bucket_size; + } + rcu_read_unlock(); + break; + } + } + preempt_enable(); +fsck_err: + percpu_up_write(&c->mark_lock); +err: + printbuf_exit(&buf); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) +{ + return bch2_trans_run(c, + bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + acc.type == BCH_DISK_ACCOUNTING_dev_data_type && + acc.dev_data_type.dev == dev + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) + : 0; + })) ?: + bch2_btree_write_buffer_flush_sync(trans)); +} + +int bch2_dev_usage_init(struct bch_dev *ca, bool gc) +{ + struct bch_fs *c = ca->fs; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; + + int ret = bch2_trans_do(c, ({ + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + (!gc ? 
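The reads above (percpu_u64_get() per counter, summed into the usage base) rely on the usual percpu-counter shape: each writer adds to its own CPU's slot, and readers sum all slots. A user-space toy of that shape, with a fixed NR_CPUS and an explicit cpu argument standing in for this_cpu_add():

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* Toy percpu counter: per-CPU slots for cheap updates, summed on read. */
struct toy_percpu_u64 {
	uint64_t v[NR_CPUS];
};

static void toy_this_cpu_add(struct toy_percpu_u64 *c, unsigned cpu, uint64_t n)
{
	c->v[cpu] += n;
}

static uint64_t toy_percpu_u64_get(const struct toy_percpu_u64 *c)
{
	uint64_t sum = 0;

	for (unsigned i = 0; i < NR_CPUS; i++)
		sum += c->v[i];
	return sum;
}

int main(void)
{
	struct toy_percpu_u64 dirty = {};

	toy_this_cpu_add(&dirty, 0, 128);	/* update made on CPU 0 */
	toy_this_cpu_add(&dirty, 3, 64);	/* update made on CPU 3 */
	printf("total: %llu\n", (unsigned long long) toy_percpu_u64_get(&dirty));
	return 0;
}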
bch2_trans_commit(trans, NULL, NULL, 0) : 0); + })); + bch_err_fn(c, ret); + return ret; +} + +void bch2_verify_accounting_clean(struct bch_fs *c) +{ + bool mismatch = false; + struct bch_fs_usage_base base = {}, base_inmem = {}; + + bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); + unsigned nr = bch2_accounting_counters(k.k); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } + + bch2_accounting_mem_read(c, k.k->p, v, nr); + + if (memcmp(a.v->d, v, nr * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " !="); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (!ca) { + rcu_read_unlock(); + continue; + } + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); + rcu_read_unlock(); + + if (memcmp(a.v->d, v, 3 * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " in mem"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("dev accounting mismatch: %s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + } + } + + 0; + }))); + + acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); + +#define check(x) \ + if (base.x != base_inmem.x) { \ + pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ + mismatch = true; \ + } + + //check(hidden); + check(btree); + check(data); + check(cached); + check(reserved); + check(nr_inodes); + + WARN_ON(mismatch); +} + +void bch2_accounting_gc_free(struct bch_fs *c) +{ + lockdep_assert_held(&c->mark_lock); + + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, true); + acc->gc_running = false; +} + +void bch2_fs_accounting_exit(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, false); + darray_exit(&acc->k); +} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h new file mode 100644 index 000000000000..f4372cafea2e --- /dev/null +++ b/fs/bcachefs/disk_accounting.h @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_H +#define _BCACHEFS_DISK_ACCOUNTING_H + +#include "btree_update.h" +#include "eytzinger.h" +#include "sb-members.h" + +static inline void bch2_u64s_neg(u64 *v, unsigned nr) +{ + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; +} + +static inline unsigned 
bch2_accounting_counters(const struct bkey *k) +{ + return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); +} + +static inline void bch2_accounting_neg(struct bkey_s_accounting a) +{ + bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); +} + +static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) +{ + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + if (a.v->d[i]) + return false; + return true; +} + +static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, + struct bkey_s_c_accounting src) +{ + EBUG_ON(dst->k.u64s != src.k->u64s); + + for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) + dst->k.bversion = src.k->bversion; +} + +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + +static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) +{ + BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + acc->_pad = p; +#else + memcpy_swab(acc, &p, sizeof(p)); +#endif +} + +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) +{ + struct bpos p; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p = acc->_pad; +#else + memcpy_swab(&p, acc, sizeof(p)); +#endif + return p; +} + +int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, + s64 *, unsigned, bool); +int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); + +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); +void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_accounting_swab(struct bkey_s); + +#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ + .key_validate = bch2_accounting_validate, \ + .val_to_text = bch2_accounting_to_text, \ + .swab = bch2_accounting_swab, \ + .min_val_size = 8, \ +}) + +int bch2_accounting_update_sb(struct btree_trans *); + +static inline int accounting_pos_cmp(const void *_l, const void *_r) +{ + const struct bpos *l = _l, *r = _r; + + return bpos_cmp(*l, *r); +} + +enum bch_accounting_mode { + BCH_ACCOUNTING_normal, + BCH_ACCOUNTING_gc, + BCH_ACCOUNTING_read, +}; + +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); +void bch2_accounting_mem_gc(struct bch_fs *); + +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +{ + return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc.type != BCH_DISK_ACCOUNTING_inum; +} + +/* + * Update in memory counters so they match the btree update we're doing; called + * from transaction commit path + */ +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) +{ + struct bch_fs *c = trans->c; + struct bch_accounting_mem *acc = &c->accounting; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + bool gc = mode == BCH_ACCOUNTING_gc; + + if (gc && !acc->gc_running) + 
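bpos_to_disk_accounting_pos()/disk_accounting_pos_to_bpos() above copy between the two representations directly on big-endian and via memcpy_swab() on little-endian. Assuming memcpy_swab() is a whole-buffer byte-reversing copy, the effect on a little-endian host is to land the leading type byte of disk_accounting_pos in the most significant byte of the key, which keeps keys of one accounting type together (the iteration in bch2_accounting_read() above skips whole types by position on that basis). A toy sketch under that assumption:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed semantics of memcpy_swab(): copy src to dst with the overall
 * byte order reversed, so the first byte of src becomes the last of dst. */
static void toy_memcpy_swab(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = (const uint8_t *) src + len;

	while (len--)
		*d++ = *--s;
}

int main(void)
{
	struct { uint8_t type; uint8_t payload[7]; } pos = { .type = 3 };
	uint64_t key;

	toy_memcpy_swab(&key, &pos, sizeof(key));
	/* on a little-endian host: 0300000000000000, type in the top byte */
	printf("key = %016llx\n", (unsigned long long) key);
	return 0;
}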
return 0; + + if (!bch2_accounting_is_mem(acc_k)) + return 0; + + if (mode == BCH_ACCOUNTING_normal) { + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } + rcu_read_unlock(); + break; + } + } + + unsigned idx; + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, mode); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(e->v[gc][i], a.v->d[i]); + return 0; +} + +static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +{ + percpu_down_read(&trans->c->mark_lock); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); + percpu_up_read(&trans->c->mark_lock); + return ret; +} + +static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, + unsigned idx, u64 *v, unsigned nr, bool gc) +{ + memset(v, 0, sizeof(*v) * nr); + + if (unlikely(idx >= acc->k.nr)) + return; + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + nr = min_t(unsigned, nr, e->nr_counters); + + for (unsigned i = 0; i < nr; i++) + v[i] = percpu_u64_get(e->v[gc] + i); +} + +static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, + u64 *v, unsigned nr) +{ + percpu_down_read(&c->mark_lock); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p); + + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); +} + +static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + EBUG_ON(!res->ref); + + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + +static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, + struct bkey_i_accounting *a, + unsigned commit_flags) +{ + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) a - (u64 *) trans->journal_entries); + + EBUG_ON(bversion_zero(a->k.bversion)); + + return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) + ? 
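journal_pos_to_bversion() above derives a key's version from its journal sequence number and its offset within that journal entry, which is what gives accounting updates their total ordering in time. A standalone sketch of the packing, and of why comparing (hi, lo) is the same as comparing (seq, offset):

#include <stdint.h>
#include <stdio.h>

/* Version split like struct bversion: 32-bit hi, 64-bit lo. */
struct toy_version { uint32_t hi; uint64_t lo; };

/* Pack (journal seq, offset within the entry) into a version; hi takes the
 * high bits of seq, lo takes the low 32 bits of seq plus the offset, so
 * lexicographic (hi, lo) order equals (seq, offset) order. */
static struct toy_version version_from_journal(uint64_t seq, uint32_t offset)
{
	return (struct toy_version) {
		.hi = seq >> 32,
		.lo = (seq << 32) | offset,
	};
}

static int version_cmp(struct toy_version a, struct toy_version b)
{
	if (a.hi != b.hi)
		return a.hi < b.hi ? -1 : 1;
	if (a.lo != b.lo)
		return a.lo < b.lo ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_version a = version_from_journal(100, 8);
	struct toy_version b = version_from_journal(100, 9);
	struct toy_version c = version_from_journal(101, 0);

	printf("%d %d\n", version_cmp(a, b), version_cmp(b, c));	/* -1 -1 */
	return 0;
}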
bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + : 0; +} + +static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, + struct bkey_i_accounting *a_i, + unsigned commit_flags) +{ + if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + struct bkey_s_accounting a = accounting_i_to_s(a_i); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_neg(a); + } +} + +int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); +int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); + +int bch2_gc_accounting_start(struct bch_fs *); +int bch2_gc_accounting_done(struct bch_fs *); + +int bch2_accounting_read(struct bch_fs *); + +int bch2_dev_usage_remove(struct bch_fs *, unsigned); +int bch2_dev_usage_init(struct bch_dev *, bool); + +void bch2_verify_accounting_clean(struct bch_fs *c); + +void bch2_accounting_gc_free(struct bch_fs *); +void bch2_fs_accounting_exit(struct bch_fs *); + +#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h new file mode 100644 index 000000000000..7b6e6c97e6aa --- /dev/null +++ b/fs/bcachefs/disk_accounting_format.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H + +#include "replicas_format.h" + +/* + * Disk accounting - KEY_TYPE_accounting - on disk format: + * + * Here, the key has considerably more structure than a typical key (bpos); an + * accounting key is 'struct disk_accounting_pos', which is a union of bpos. + * + * More specifically: a key is just a muliword integer (where word endianness + * matches native byte order), so we're treating bpos as an opaque 20 byte + * integer and mapping bch_accounting_key to that. + * + * This is a type-tagged union of all our various subtypes; a disk accounting + * key can be device counters, replicas counters, et cetera - it's extensible. + * + * The value is a list of u64s or s64s; the number of counters is specific to a + * given accounting type. + * + * Unlike with other key types, updates are _deltas_, and the deltas are not + * resolved until the update to the underlying btree, done by btree write buffer + * flush or journal replay. + * + * Journal replay in particular requires special handling. The journal tracks a + * range of entries which may possibly have not yet been applied to the btree + * yet - it does not know definitively whether individual entries are dirty and + * still need to be applied. + * + * To handle this, we use the version field of struct bkey, and give every + * accounting update a unique version number - a total ordering in time; the + * version number is derived from the key's position in the journal. Then + * journal replay can compare the version number of the key from the journal + * with the version number of the key in the btree to determine if a key needs + * to be replayed. + * + * For this to work, we must maintain this strict time ordering of updates as + * they are flushed to the btree, both via write buffer flush and via journal + * replay. 
This has complications for the write buffer code while journal replay + * is still in progress; the write buffer cannot flush any accounting keys to + * the btree until journal replay has finished replaying its accounting keys, or + * the (newer) version number of the keys from the write buffer will cause + * updates from journal replay to be lost. + */ + +struct bch_accounting { + struct bch_val v; + __u64 d[]; +}; + +#define BCH_ACCOUNTING_MAX_COUNTERS 3 + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) \ + x(unstriped, 10) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +#define BCH_DISK_ACCOUNTING_TYPES() \ + x(nr_inodes, 0) \ + x(persistent_reserved, 1) \ + x(replicas, 2) \ + x(dev_data_type, 3) \ + x(compression, 4) \ + x(snapshot, 5) \ + x(btree, 6) \ + x(rebalance_work, 7) \ + x(inum, 8) + +enum disk_accounting_type { +#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + BCH_DISK_ACCOUNTING_TYPE_NR, +}; + +struct bch_nr_inodes { +}; + +struct bch_persistent_reserved { + __u8 nr_replicas; +}; + +struct bch_dev_data_type { + __u8 dev; + __u8 data_type; +}; + +struct bch_acct_compression { + __u8 type; +}; + +struct bch_acct_snapshot { + __u32 id; +} __packed; + +struct bch_acct_btree { + __u32 id; +} __packed; + +struct bch_acct_inum { + __u64 inum; +} __packed; + +struct bch_acct_rebalance_work { +}; + +struct disk_accounting_pos { + union { + struct { + __u8 type; + union { + struct bch_nr_inodes nr_inodes; + struct bch_persistent_reserved persistent_reserved; + struct bch_replicas_entry_v1 replicas; + struct bch_dev_data_type dev_data_type; + struct bch_acct_compression compression; + struct bch_acct_snapshot snapshot; + struct bch_acct_btree btree; + struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; + } __packed; + } __packed; + struct bpos _pad; + }; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h new file mode 100644 index 000000000000..b1982131b206 --- /dev/null +++ b/fs/bcachefs/disk_accounting_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H +#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H + +#include "darray.h" + +struct accounting_mem_entry { + struct bpos pos; + struct bversion bversion; + unsigned nr_counters; + u64 __percpu *v[2]; +}; + +struct bch_accounting_mem { + DARRAY(struct accounting_mem_entry) k; + bool gc_running; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 06a7df529b40..5df8de0b8c02 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf 
*err) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); @@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) continue; g = BCH_MEMBER_GROUP(&m); @@ -512,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return -EINVAL; if (!c) - return 0; + return -BCH_ERR_option_needs_open_fs; if (!strlen(val) || !strcmp(val, "none")) { *res = 0; @@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { *res = dev_to_target(ca->dev_idx); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return 0; } @@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi case TARGET_DEV: { struct bch_member m = bch2_sb_member_get(sb, t.dev); - if (bch2_dev_exists(sb, t.dev)) { + if (bch2_member_exists(sb, t.dev)) { prt_printf(out, "Device "); pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h new file mode 100644 index 000000000000..698990bbf1d2 --- /dev/null +++ b/fs/bcachefs/disk_groups_format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H +#define _BCACHEFS_DISK_GROUPS_FORMAT_H + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d503af270024..d2a5e76e6479 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -13,10 +13,12 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "error.h" #include "io_read.h" +#include "io_write.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -24,6 +26,7 @@ #include "util.h" #include <linux/sort.h> +#include <linux/string_choices.h> #ifdef __KERNEL__ @@ -106,24 +109,28 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, - stripe_pos_bad, + bpos_gt(k.k->p, POS(0, U32_MAX)), + c, stripe_pos_bad, "stripe at bad pos"); - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, - stripe_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), + c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + bkey_fsck_err_on(s->csum_granularity_bits 
>= 64, + c, stripe_csum_granularity_bad, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -131,174 +138,251 @@ fsck_err: void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; + struct bch_stripe s = {}; - prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - nr_data, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); + memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); - for (i = 0; i < s->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = s->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + unsigned nr_data = s.nr_blocks - s.nr_redundant; - prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); - if (i < nr_data) - prt_printf(out, "#%u", stripe_blockcount_get(s, i)); - prt_printf(out, " gen %u", ptr->gen); - if (ptr_stale(ca, ptr)) - prt_printf(out, " stale"); + prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", + s.algorithm, + le16_to_cpu(s.sectors), + nr_data, + s.nr_redundant); + bch2_prt_csum_type(out, s.csum_type); + prt_str(out, " gran "); + if (s.csum_granularity_bits < 64) + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); + else + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); + + if (s.disk_label) { + prt_str(out, " label"); + bch2_disk_path_to_text(out, c, s.disk_label - 1); + } + + for (unsigned i = 0; i < s.nr_blocks; i++) { + const struct bch_extent_ptr *ptr = sp->ptrs + i; + + if ((void *) ptr >= bkey_val_end(k)) + break; + + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, ptr); + + if (s.csum_type < BCH_CSUM_NR && + i < nr_data && + stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) + prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } /* Triggers: */ -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int __mark_stripe_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + struct bpos bucket, + struct bch_alloc_v4 *a, + enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? 
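__mark_stripe_bucket() above treats block ptr_idx as parity when it falls in the last nr_redundant slots (data blocks come first), and only parity blocks contribute their sectors to the bucket in this trigger. A trivial standalone version of that classification:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stripe block layout: nr_blocks total, data blocks first, the final
 * nr_redundant blocks hold parity. */
static bool stripe_block_is_parity(unsigned ptr_idx,
				   unsigned nr_blocks, unsigned nr_redundant)
{
	return ptr_idx >= nr_blocks - nr_redundant;
}

/* Sectors this trigger charges to the block's bucket: parity blocks carry
 * the stripe block size, data blocks are charged via their extents. */
static int64_t stripe_block_sectors(unsigned ptr_idx, unsigned nr_blocks,
				    unsigned nr_redundant, uint16_t sectors,
				    bool deleting)
{
	int64_t s = stripe_block_is_parity(ptr_idx, nr_blocks, nr_redundant)
		? sectors : 0;
	return deleting ? -s : s;
}

int main(void)
{
	/* e.g. a 6-block stripe with 2 parity blocks and 128-sector blocks */
	for (unsigned i = 0; i < 6; i++)
		printf("block %u: parity=%d sectors=%lld\n", i,
		       stripe_block_is_parity(i, 6, 2),
		       (long long) stripe_block_sectors(i, 6, 2, 128, false));
	return 0;
}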
le16_to_cpu(s.v->sectors) : 0; + struct printbuf buf = PRINTBUF; int ret = 0; + struct bch_fs *c = trans->c; if (deleting) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors); - if (ret) - goto err; - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { - ret = -EIO; + if (bch2_trans_inconsistent_on(a->stripe || + a->stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->stripe, s.k->p.offset, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; goto err; } - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - s.k->p.offset)) { - ret = -EIO; + if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; goto err; } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; } else { - if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { - ret = -EIO; + if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || + a->stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", + bucket.inode, bucket.offset, a->gen, + a->stripe, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; + goto err; + } + + if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, + "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + bch2_data_type_str(data_type), + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; goto err; } - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + if (bch2_trans_inconsistent_on(parity && + (a->dirty_sectors != -sectors || + a->cached_sectors), trans, + "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", + bucket.inode, bucket.offset, a->gen, + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; + goto err; + } } - a->v.dirty_sectors 
+= sectors; - if (data_type) - a->v.data_type = !deleting ? data_type : 0; + if (sectors) { + ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, + a->gen, a->data_type, &a->dirty_sectors); + if (ret) + goto err; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; + if (!deleting) { + a->stripe = s.k->p.offset; + a->stripe_redundancy = s.v->nr_redundant; + alloc_data_type_set(a, data_type); + } else { + a->stripe = 0; + a->stripe_redundancy = 0; + alloc_data_type_set(a, BCH_DATA_user); + } err: - bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; struct printbuf buf = PRINTBUF; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - /* * XXX doesn't handle deletion */ - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); - - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; + struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); + if (unlikely(!ca)) { + if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_mark_stripe; goto err; } - bucket_lock(g); - old = *g; + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors); - if (ret) - goto err; + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update(trans, bucket, 0); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + } + + if (flags & BTREE_TRIGGER_gc) { + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; + goto err; + } - g->data_type = data_type; - g->dirty_sectors += sectors; + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + alloc_to_bucket(g, new); + bucket_unlock(g); - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + } err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } +static int mark_stripe_buckets(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + const struct bch_stripe *old_s = 
old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + + BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); + + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false, flags); + if (ret) + return ret; + } + + if (old_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true, flags); + if (ret) + return ret; + } + } + + return 0; +} + +static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) +{ + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->disk_label = s->disk_label; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); +} + int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_s_c new = _new.s_c; struct bch_fs *c = trans->c; @@ -308,7 +392,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(new).v : NULL; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -319,55 +411,68 @@ int bch2_trigger_stripe(struct btree_trans *trans, new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); + struct gc_stripe *gc = NULL; + if (flags & BTREE_TRIGGER_gc) { + gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + if (!gc) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + * + * Also: when we bring back runtime gc, locking + */ + gc->alive = true; + gc->sectors = le16_to_cpu(new_s->sectors); + gc->nr_blocks = new_s->nr_blocks; + gc->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + gc->ptrs[i] = new_s->ptrs[i]; + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); + } if (new_s) { - s64 sectors = le16_to_cpu(new_s->sectors); + s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, new); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, new); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, 
gc); if (ret) return ret; + + if (gc) + memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); } if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, old); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, old); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); if (ret) return ret; } - unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false); - if (ret) - return ret; - } - - if (old_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); - if (ret) - return ret; - } - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; } - if (flags & BTREE_TRIGGER_ATOMIC) { + if (flags & BTREE_TRIGGER_atomic) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { @@ -390,14 +495,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, memset(m, 0, sizeof(*m)); } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + stripe_to_mem(m, new_s); if (!old_s) bch2_stripes_heap_insert(c, m, idx); @@ -406,54 +504,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_GC) { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - for (unsigned i = 0; i < new_s->nr_blocks; i++) { - int ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } - - int ret = bch2_update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - 0, true); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - return 0; } @@ -504,7 +554,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +581,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = 
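The stripe trigger above books replicas accounting as sectors * nr_redundant, positive for the new key and negative for the old one, so rewriting a stripe whose size and redundancy are unchanged nets out to zero. A small arithmetic illustration:

#include <stdint.h>
#include <stdio.h>

/* Delta charged to the stripe's replicas accounting entry: the per-block
 * sector count times the number of redundant blocks, signed by whether the
 * key is being added or removed. */
static int64_t stripe_replicas_delta(uint16_t sectors, uint8_t nr_redundant,
				     int sign)
{
	return (int64_t) sectors * nr_redundant * sign;
}

int main(void)
{
	/* e.g. 128-sector blocks with 2 redundant blocks */
	int64_t add = stripe_replicas_delta(128, 2, +1);	/* new key */
	int64_t del = stripe_replicas_delta(128, 2, -1);	/* old key */

	printf("add %lld, del %lld, net %lld\n",
	       (long long) add, (long long) del, (long long) (add + del));
	return 0;
}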
kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } @@ -604,21 +654,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct printbuf err = PRINTBUF; - struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); - - prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", - want.hi, want.lo, - got.hi, got.lo, - bch2_csum_types[v->csum_type]); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); + struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); + if (ca) { + struct printbuf err = PRINTBUF; - clear_bit(i, buf->valid); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); + + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + clear_bit(i, buf->valid); break; } @@ -681,14 +731,16 @@ static void ec_block_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "erasure coding %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(ca->fs, - "error %s stripe: stale pointer after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to"); + "error %s stripe: stale/invalid pointer (%i) after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to", + stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } @@ -703,25 +755,28 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + if (!ca) { clear_bit(idx, buf->valid); return; } - if (!bch2_dev_get_ioref(ca, rw)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer (%i)", + rw == READ ? 
"reading from" : "writing to", + stale); clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { @@ -767,7 +822,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); + POS(0, idx), BTREE_ITER_slots); ret = bkey_err(k); if (ret) goto err; @@ -782,13 +837,16 @@ err: } /* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf; + struct ec_stripe_buf *buf = NULL; struct closure cl; struct bch_stripe *v; unsigned i, offset; + const char *msg = NULL; + struct printbuf msgbuf = PRINTBUF; int ret = 0; closure_init_stack(&cl); @@ -801,32 +859,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { - bch_err_ratelimited(c, - "error doing reconstruct read: error %i looking up stripe", ret); - kfree(buf); - return -EIO; + msg = "stripe not found"; + goto err; } v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - bch_err_ratelimited(c, - "error doing reconstruct read: pointer doesn't match stripe"); - ret = -EIO; + msg = "pointer doesn't match stripe"; goto err; } offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - bch_err_ratelimited(c, - "error doing reconstruct read: read is bigger than stripe"); - ret = -EIO; + msg = "read is bigger than stripe"; goto err; } ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); - if (ret) + if (ret) { + msg = "-ENOMEM"; goto err; + } for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); @@ -834,9 +888,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - ret = -EIO; + msg = "unable to read enough blocks"; goto err; } @@ -848,10 +900,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -err: +out: ec_stripe_buf_exit(buf); kfree(buf); return ret; +err: + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); + printbuf_exit(&msgbuf); + ret = -BCH_ERR_stripe_reconstruct; + goto out; } /* stripe bucket accounting: */ @@ -866,8 +925,8 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) mutex_lock(&c->ec_stripes_heap_lock); if (n.size > h->size) { - memcpy(n.data, h->data, h->used * sizeof(h->data[0])); - n.used = h->used; + memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); + n.nr = h->nr; swap(*h, n); } mutex_unlock(&c->ec_stripes_heap_lock); @@ -878,7 +937,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return 
-BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -958,7 +1017,7 @@ static u64 stripe_idx_to_delete(struct bch_fs *c) lockdep_assert_held(&c->ec_stripes_heap_lock); - if (h->used && + if (h->nr && h->data[0].blocks_nonempty == 0 && !bch2_stripe_is_open(c, h->data[0].idx)) return h->data[0].idx; @@ -966,14 +1025,6 @@ static u64 stripe_idx_to_delete(struct bch_fs *c) return 0; } -static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, - struct ec_stripe_heap_entry l, - struct ec_stripe_heap_entry r) -{ - return ((l.blocks_nonempty > r.blocks_nonempty) - - (l.blocks_nonempty < r.blocks_nonempty)); -} - static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, size_t i) { @@ -982,12 +1033,40 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; } +static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) +{ + struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; + struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; + + return ((_l->blocks_nonempty > _r->blocks_nonempty) < + (_l->blocks_nonempty < _r->blocks_nonempty)); +} + +static inline void ec_stripes_heap_swap(void *l, void *r, void *h) +{ + struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; + struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; + ec_stripes_heap *_h = (ec_stripes_heap *)h; + size_t i = _l - _h->data; + size_t j = _r - _h->data; + + swap(*_l, *_r); + + ec_stripes_heap_set_backpointer(_h, i); + ec_stripes_heap_set_backpointer(_h, j); +} + +static const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, +}; + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m = genradix_ptr(&c->stripes, idx); - BUG_ON(m->heap_idx >= h->used); + BUG_ON(m->heap_idx >= h->nr); BUG_ON(h->data[m->heap_idx].idx != idx); } @@ -997,9 +1076,7 @@ void bch2_stripes_heap_del(struct bch_fs *c, mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); - heap_del(&c->ec_stripes_heap, m->heap_idx, - ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); mutex_unlock(&c->ec_stripes_heap_lock); } @@ -1007,14 +1084,15 @@ void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(heap_full(&c->ec_stripes_heap)); + BUG_ON(min_heap_full(&c->ec_stripes_heap)); - heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; + min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, }), - ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + &callbacks, + &c->ec_stripes_heap); heap_verify_backpointer(c, idx); mutex_unlock(&c->ec_stripes_heap_lock); @@ -1033,10 +1111,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); + min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); heap_verify_backpointer(c, idx); @@ -1058,7 +1134,7 @@ static int ec_stripe_delete(struct btree_trans 
*trans, u64 idx) int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1100,7 +1176,7 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ec_stripe_delete(trans, idx)); bch_err_fn(c, ret); if (ret) @@ -1120,47 +1196,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - bool create) + struct bkey_i_stripe *old, + struct bkey_i_stripe *new) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; + bool create = !old; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type]); + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; - unsigned i; + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - if (old->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); + BUG_ON(old->v.nr_blocks != v->nr_blocks); + + for (unsigned i = 0; i < new->v.nr_blocks; i++) { + unsigned sectors = stripe_blockcount_get(v, i); - for (i = 0; i < new->v.nr_blocks; i++) { - unsigned v = stripe_blockcount_get(old, i); + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { + struct printbuf buf = PRINTBUF; - BUG_ON(v && - (old->ptrs[i].dev != new->v.ptrs[i].dev || - old->ptrs[i].gen != new->v.ptrs[i].gen || - old->ptrs[i].offset != new->v.ptrs[i].offset)); + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + /* + * If the stripe ptr changed underneath us, it must have + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() + */ + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); + + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; + } - stripe_blockcount_set(&new->v, i, v); + stripe_blockcount_set(&new->v, i, sectors); } } @@ -1171,48 +1262,42 @@ err: } static int ec_stripe_update_extent(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bpos *bp_pos) + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; - struct bch_backpointer bp; struct 
btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_ptr *ec_ptr = NULL; struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_pos, &bp, BTREE_ITER_CACHED); - if (ret) - return ret; - if (bpos_eq(*bp_pos, SPOS_MAX)) - return 0; - - if (bp.level) { + if (bp.v->level) { struct printbuf buf = PRINTBUF; struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); bch2_trans_iter_exit(trans, &node_iter); if (!b) return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); ret = bkey_err(k); if (ret) return ret; @@ -1244,7 +1329,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); @@ -1270,25 +1355,37 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b { struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr bucket = v->ptrs[block]; - struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); - struct bpos bp_pos = POS_MIN; + struct bch_extent_ptr ptr = v->ptrs[block]; int ret = 0; - while (1) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_pos)); - if (ret) - break; - if (bkey_eq(bp_pos, POS_MAX)) + struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); + if (!ca) + return -EIO; + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket_pos), + bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ({ + if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; - bp_pos = bpos_nosnap_successor(bp_pos); - } + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, + bkey_s_c_to_backpointer(bp_k), &last_flushed); + })); + bch2_bkey_buf_exit(&last_flushed, c); + bch2_dev_put(ca); return ret; } @@ -1319,20 +1416,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - int ret; - - if (!bch2_dev_get_ioref(ca, WRITE)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; } + unsigned offset = ca->mi.bucket_size - ob->sectors_free; memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 9); - ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + int ret = 
blkdev_issue_zeroout(ca->disk_sb.bdev, ob->bucket * ca->mi.bucket_size + offset, ob->sectors_free, GFP_KERNEL, 0); @@ -1414,12 +1509,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } - ret = bch2_trans_do(c, &s->res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_key_update(trans, - bkey_i_to_stripe(&s->new_stripe.key), - !s->have_existing_stripe)); + ret = bch2_trans_commit_do(c, &s->res, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, + ec_stripe_key_update(trans, + s->have_existing_stripe + ? bkey_i_to_stripe(&s->existing_stripe.key) + : NULL, + bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; @@ -1491,10 +1588,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; + lockdep_assert_held(&h->lock); + BUG_ON(!s->allocated && !s->err); h->s = NULL; @@ -1507,6 +1606,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_new_put(c, s, STRIPE_REF_io); } +static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) +{ + h->s->err = err; + ec_stripe_new_set_pending(c, h); +} + void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; @@ -1517,16 +1622,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct bch_dev *ca; - unsigned offset; - if (!ob) return NULL; BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - ca = bch_dev_bkey_exists(c, ob->dev); - offset = ca->mi.bucket_size - ob->sectors_free; + struct bch_dev *ca = ob_dev(c, ob); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } @@ -1580,7 +1682,8 @@ static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data, unsigned nr_parity, - unsigned stripe_size) + unsigned stripe_size, + unsigned disk_label) { struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; @@ -1591,7 +1694,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; - s->v.pad = 0; + s->v.disk_label = disk_label; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { BUG_ON(1 << s->v.csum_granularity_bits >= @@ -1603,7 +1706,7 @@ static void ec_stripe_key_init(struct bch_fs *c, set_bkey_val_u64s(&s->k, u64s); } -static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; @@ -1611,7 +1714,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + return NULL; mutex_init(&s->lock); closure_init(&s->iodone, NULL); @@ -1624,40 +1727,29 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->nr_parity = h->redundancy; ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, h->blocksize); - - h->s = s; - return 0; + s->nr_data, 
s->nr_parity, + h->blocksize, h->disk_label); + return s; } -static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy, - enum bch_watermark watermark) +static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_head *h; - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return NULL; - - mutex_init(&h->lock); - BUG_ON(!mutex_trylock(&h->lock)); - - h->target = target; - h->algo = algo; - h->redundancy = redundancy; - h->watermark = watermark; + struct bch_devs_mask devs = h->devs; rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, target); + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + unsigned nr_devs = dev_mask_nr(&h->devs); for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) __clear_bit(ca->dev_idx, h->devs.d); + unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); h->blocksize = pick_blocksize(c, &h->devs); + h->nr_active_devs = 0; for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1668,9 +1760,50 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, * If we only have redundancy + 1 devices, we're better off with just * replication: */ - if (h->nr_active_devs < h->redundancy + 2) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", - h->nr_active_devs, h->redundancy + 2); + h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; + + if (h->insufficient_devs) { + const char *err; + + if (nr_devs < h->redundancy + 2) + err = NULL; + else if (nr_devs_with_durability < h->redundancy + 2) + err = "cannot use durability=0 devices"; + else + err = "mismatched bucket sizes"; + + if (err) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", + h->nr_active_devs, h->redundancy + 2, err); + } + + struct bch_devs_mask devs_leaving; + bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); + + if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) + ec_stripe_new_cancel(c, h, -EINTR); + + h->rw_devs_change_count = c->rw_devs_change_count; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, + unsigned algo, unsigned redundancy, + enum bch_watermark watermark) +{ + struct ec_stripe_head *h; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); + + h->disk_label = disk_label; + h->algo = algo; + h->redundancy = redundancy; + h->watermark = watermark; list_add(&h->list, &c->ec_stripe_head_list); return h; @@ -1682,14 +1815,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data) - ec_stripe_set_pending(c, h); + ec_stripe_new_set_pending(c, h); mutex_unlock(&h->lock); } static struct ec_stripe_head * __bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) @@ -1707,63 +1840,84 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto found; + goto err; } list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->target == target && + if (h->disk_label == disk_label && h->algo == algo && 
h->redundancy == redundancy && h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) + if (ret) { h = ERR_PTR(ret); + goto err; + } goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); + h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); + if (!h) { + h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + goto err; + } found: - if (!IS_ERR_OR_NULL(h) && - h->nr_active_devs < h->redundancy + 2) { + if (h->rw_devs_change_count != c->rw_devs_change_count) + ec_stripe_head_devs_update(c, h); + + if (h->insufficient_devs) { mutex_unlock(&h->lock); h = NULL; } +err: mutex_unlock(&c->ec_stripe_head_lock); return h; } -static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, +static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct ec_stripe_head *h, struct ec_stripe_new *s, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(v->nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); + + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to + * walk backpointers and update all extents that point to that + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); - for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { - __clear_bit(v->ptrs[i].dev, devs.d); - if (i < h->s->nr_data) + if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } - BUG_ON(nr_have_data > h->s->nr_data); - BUG_ON(nr_have_parity > h->s->nr_parity); + BUG_ON(nr_have_data > s->nr_data); + BUG_ON(nr_have_parity > s->nr_parity); buckets.nr = 0; - if (nr_have_parity < h->s->nr_parity) { + if (nr_have_parity < s->nr_parity) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, - h->s->nr_parity, + s->nr_parity, &nr_have_parity, &have_cache, 0, BCH_DATA_parity, @@ -1771,14 +1925,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data + h->s->nr_parity, - h->s->nr_data); - BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + s->nr_parity); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1786,11 +1940,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ } buckets.nr = 0; - if (nr_have_data < h->s->nr_data) { + if (nr_have_data < s->nr_data) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, - h->s->nr_data, + s->nr_data, 
&nr_have_data, &have_cache, 0, BCH_DATA_user, @@ -1798,13 +1952,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data, 0); - BUG_ON(j >= h->s->nr_data); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1814,7 +1968,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ return 0; } -/* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, struct ec_stripe_head *head) { @@ -1828,7 +1981,7 @@ static s64 get_existing_stripe(struct bch_fs *c, return -1; mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; @@ -1837,7 +1990,8 @@ static s64 get_existing_stripe(struct bch_fs *c, m = genradix_ptr(&c->stripes, stripe_idx); - if (m->algorithm == head->algo && + if (m->disk_label == head->disk_label && + m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant && @@ -1850,73 +2004,78 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) { - struct bch_fs *c = trans->c; - struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; - struct bch_stripe *existing_v; + struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; unsigned i; - s64 idx; - int ret; - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; - - ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); - if (ret) { - bch2_stripe_close(c, h->s); - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); - return ret; - } - - existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; - - BUG_ON(existing_v->nr_redundant != h->s->nr_parity); - h->s->nr_data = existing_v->nr_blocks - + BUG_ON(existing_v->nr_redundant != s->nr_parity); + s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); return ret; } - BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict with * blocks from the stripe we're reusing: */ - for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); - h->s->blocks[i] 
= 0; + for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); + s->blocks[i] = 0; } - memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); - memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); + memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - for (i = 0; i < existing_v->nr_blocks; i++) { + for (unsigned i = 0; i < existing_v->nr_blocks; i++) { if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); + __set_bit(i, s->blocks_gotten); + __set_bit(i, s->blocks_allocated); } - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); } - bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); - h->s->have_existing_stripe = true; + bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); + s->have_existing_stripe = true; return 0; } -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) +{ + struct bch_fs *c = trans->c; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); + bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, + "reading stripe key: %s", bch2_err_str(ret)); + if (ret) { + bch2_stripe_close(c, s); + return ret; + } + + return init_new_stripe_from_existing(c, s); +} + +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -1925,17 +2084,21 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - if (!h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, + if (!s->res.sectors) { + ret = bch2_disk_reservation_get(c, &s->res, h->blocksize, - h->s->nr_parity, + s->nr_parity, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; } + /* + * Allocate stripe slot + * XXX: we're going to need a bitrange btree of free stripes + */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; @@ -1948,7 +2111,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, h->s, k.k->p.offset)) + bch2_try_open_stripe(c, s, k.k->p.offset)) break; } @@ -1959,16 +2122,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st ret = ec_stripe_mem_alloc(trans, &iter); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); goto err; } - h->s->new_stripe.key.k.p = iter.pos; + s->new_stripe.key.k.p = iter.pos; out: bch2_trans_iter_exit(trans, &iter); return ret; err: - bch2_disk_reservation_put(c, &h->s->res); + bch2_disk_reservation_put(c, &s->res); goto out; } @@ -1982,29 +2145,44 @@ struct ec_stripe_head 
*bch2_ec_stripe_head_get(struct btree_trans *trans, struct bch_fs *c = trans->c; struct ec_stripe_head *h; bool waiting = false; + unsigned disk_label = 0; + struct target t = target_decode(target); int ret; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (t.type == TARGET_GROUP) { + if (t.group > U8_MAX) { + bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); + return NULL; + } + disk_label = t.group + 1; /* 0 == no label */ + } + + h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); if (IS_ERR_OR_NULL(h)) return h; if (!h->s) { - ret = ec_new_stripe_alloc(c, h); - if (ret) { + h->s = ec_new_stripe_alloc(c, h); + if (!h->s) { + ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; bch_err(c, "failed to allocate new stripe"); goto err; } + + h->nr_created++; } - if (h->s->allocated) + struct ec_stripe_new *s = h->s; + + if (s->allocated) goto allocated; - if (h->s->have_existing_stripe) + if (s->have_existing_stripe) goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2016,15 +2194,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * existing stripe: */ while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h); + ret = __bch2_ec_stripe_head_reuse(trans, h, s); if (!ret) break; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; goto allocate_buf; @@ -2042,19 +2220,19 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); if (ret) goto err; - h->s->allocated = true; + s->allocated = true; allocated: - BUG_ON(!h->s->idx); - BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(!s->idx); + BUG_ON(!s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: @@ -2062,6 +2240,73 @@ err: return ERR_PTR(ret); } +/* device removal */ + +static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_i_stripe *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + s64 sectors = 0; + for (unsigned i = 0; i < s->v.nr_blocks; i++) + sectors -= stripe_blockcount_get(&s->v, i); + + bch2_bkey_to_replicas(&acc.replicas, 
bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == k_a.k->p.inode) + ptr->dev = BCH_SB_MEMBER_INVALID; + + sectors = -sectors; + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +{ + return bch2_trans_run(c, + for_each_btree_key_max_commit(trans, iter, + BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ + bch2_invalidate_stripe_to_dev(trans, k); + }))); +} + +/* startup/shutdown */ + static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; @@ -2087,8 +2332,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -BCH_ERR_erofs_no_writes; - ec_stripe_set_pending(c, h); + ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); unlock: mutex_unlock(&h->lock); } @@ -2125,7 +2369,7 @@ int bch2_stripes_read(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ if (k.k->type != KEY_TYPE_stripe) continue; @@ -2133,17 +2377,9 @@ int bch2_stripes_read(struct bch_fs *c) if (ret) break; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + stripe_to_mem(m, bkey_s_c_to_stripe(k).v); bch2_stripes_heap_insert(c, m, k.k->p.offset); 0; @@ -2159,7 +2395,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->used, 50); i++) { + for (i = 0; i < min_t(size_t, h->nr, 50); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, @@ -2173,6 +2409,25 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripes_heap_lock); } +static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct ec_stripe_new *s) +{ + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", + s->idx, s->nr_data, s->nr_parity, + bitmap_weight(s->blocks_allocated, s->nr_data), + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i; + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) + prt_printf(out, " %u", s->blocks[i]); + prt_newline(out); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); + prt_newline(out); +} + void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; @@ -2180,28 +2435,21 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { -
prt_printf(out, "target %u algo %u redundancy %u %s:\n", - h->target, h->algo, h->redundancy, - bch2_watermarks[h->watermark]); + prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", + h->disk_label, h->algo, h->redundancy, + bch2_watermarks[h->watermark], + h->nr_created); if (h->s) - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", - h->s->idx, h->s->nr_data, h->s->nr_parity, - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data)); + bch2_new_stripe_to_text(out, c, h->s); } mutex_unlock(&c->ec_stripe_head_lock); prt_printf(out, "in flight:\n"); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", - s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - } + list_for_each_entry(s, &c->ec_stripe_new_list, list) + bch2_new_stripe_to_text(out, c, s); mutex_unlock(&c->ec_stripe_new_lock); } @@ -2212,11 +2460,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) while (1) { mutex_lock(&c->ec_stripe_head_lock); - h = list_first_entry_or_null(&c->ec_stripe_head_list, - struct ec_stripe_head, list); - if (h) - list_del(&h->list); + h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); mutex_unlock(&c->ec_stripe_head_lock); + if (!h) break; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f4369b02e805..583ca6a226da 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,17 +6,16 @@ #include "buckets_types.h" #include "extents_types.h" -enum bkey_invalid_flags; - -int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_invalid = bch2_stripe_invalid, \ + .key_validate = bch2_stripe_validate, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_stripe, \ @@ -32,6 +31,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { + EBUG_ON(s->csum_type >= BCH_CSUM_NR); + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + @@ -95,7 +96,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe const struct bch_extent_ptr *data_ptr, unsigned sectors) { - return data_ptr->dev == stripe_ptr->dev && + return (data_ptr->dev == stripe_ptr->dev || + data_ptr->dev == BCH_SB_MEMBER_INVALID || + stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && data_ptr->gen == stripe_ptr->gen && data_ptr->offset >= stripe_ptr->offset && data_ptr->offset < stripe_ptr->offset + sectors; @@ -184,10 +187,15 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - unsigned target; + unsigned disk_label; unsigned algo; unsigned redundancy; enum bch_watermark watermark; + bool insufficient_devs; + + unsigned long rw_devs_change_count; + + u64 nr_created; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -200,7 +208,7 @@ struct ec_stripe_head { struct ec_stripe_new *s; }; -int 
bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); @@ -245,6 +253,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } +int bch2_dev_remove_stripes(struct bch_fs *, unsigned); + void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); void bch2_fs_ec_flush(struct bch_fs *); diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h index 44ce88ba08d7..b9770f24f213 100644 --- a/fs/bcachefs/ec_format.h +++ b/fs/bcachefs/ec_format.h @@ -11,7 +11,31 @@ struct bch_stripe { __u8 csum_granularity_bits; __u8 csum_type; - __u8 pad; + + /* + * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2 + * + * we can manage with this because this only needs to point to a + * disk label, not a target: + */ + __u8 disk_label; + + /* + * Variable length sections: + * - Pointers + * - Checksums + * 2D array of [stripe block/device][csum block], with checksum block + * size given by csum_granularity_bits + * - Block sector counts: per-block array of u16s + * + * XXX: + * Either checksums should have come last, or we should have included a + * checksum_size field (the size in bytes of the checksum itself, not + * the blocksize the checksum covers). + * + * Currently we aren't able to access the block sector counts if the + * checksum type is unknown. + */ struct bch_extent_ptr ptrs[]; } __packed __aligned(8); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 976426da3a12..8d1e70e830ac 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -16,6 +16,7 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; u8 blocks_nonempty; + u8 disk_label; }; struct gc_stripe { @@ -36,6 +37,6 @@ struct ec_stripe_heap_entry { unsigned blocks_nonempty; }; -typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; +typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index d260ff9bbfeb..43557bebd0f8 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "errcode.h" +#include "trace.h" #include <linux/errname.h> @@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class) return err == class; } -int __bch2_err_class(int err) +int __bch2_err_class(int bch_err) { - err = -err; - BUG_ON((unsigned) err >= BCH_ERR_MAX); + int std_err = -bch_err; + BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) - err = bch2_errcode_parents[err - BCH_ERR_START]; + while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) + std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; + + trace_error_downcast(bch_err, std_err, _RET_IP_); - return -err; + return -std_err; } const char *bch2_blk_status_to_str(blk_status_t status) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8c40c2067a04..4590cd0c7c90 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,10 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, mount_option) \ + x(BCH_ERR_mount_option, option_name) \ + x(BCH_ERR_mount_option, option_value) \ + x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, 
ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -50,7 +54,8 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ + x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ @@ -78,6 +83,9 @@ x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOMEM, ENOMEM_disk_accounting) \ + x(ENOMEM, ENOMEM_stripe_head_alloc) \ + x(ENOMEM, ENOMEM_journal_read_bucket) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -109,8 +117,15 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ - x(0, open_buckets_empty) \ - x(0, freelist_empty) \ + x(ENOENT, ENOENT_inode_no_backpointer) \ + x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) \ + x(EEXIST, EEXIST_discard_in_flight_add) \ + x(EEXIST, EEXIST_subvolume_create) \ + x(ENOSPC, open_buckets_empty) \ + x(ENOSPC, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ x(0, transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ @@ -136,6 +151,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(BCH_ERR_transaction_restart, transaction_restart_commit) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ @@ -152,15 +168,17 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, backpointer_to_overwritten_btree_node) \ - x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, restart_recovery) \ + x(EINVAL, restart_recovery) \ + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ @@ -168,6 +186,7 @@ x(EINVAL, block_size_too_small) \ x(EINVAL, bucket_size_too_small) \ x(EINVAL, device_size_too_small) \ + x(EINVAL, device_size_too_big) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ @@ -176,6 +195,11 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ + x(EINVAL, varint_decode_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -207,6 +231,7 @@ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + 
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ @@ -223,9 +248,27 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ + x(EIO, bucket_ref_update) \ + x(EIO, trigger_pointer) \ + x(EIO, trigger_stripe_pointer) \ + x(EIO, metadata_bucket_inconsistency) \ + x(EIO, mark_stripe) \ + x(EIO, stripe_reconstruct) \ + x(EIO, key_type_error) \ + x(EIO, no_device_to_read_from) \ + x(EIO, missing_indirect_extent) \ + x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -238,7 +281,10 @@ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) + x(BCH_ERR_nopromote, nopromote_enomem) \ + x(0, invalid_snapshot_node) \ + x(0, option_needs_open_fs) \ + x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, @@ -271,6 +317,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) +#include <linux/blk_types.h> const char *bch2_blk_status_to_str(blk_status_t); #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d32c8bebe46c..038da6a61f6b 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" +#include "btree_iter.h" #include "error.h" +#include "fs-common.h" +#include "journal.h" +#include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -13,9 +18,11 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only"); + bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); @@ -25,11 +32,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_recovery_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) @@ -89,7 +101,7 @@ static enum ask_yn parse_yn_response(char *buf) } #ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn 
bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { struct stdio_redirect *stdio = c->stdio; @@ -99,25 +111,44 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) if (!stdio) return YN_NO; - char buf[100]; + if (trans) + bch2_trans_unlock(trans); + + unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; + darray_char line = {}; int ret; do { + unsigned long t; bch2_print(c, " (y,n, or Y,N for all errors of this type) "); +rewait: + t = unlock_long_at + ? max_t(long, unlock_long_at - jiffies, 0) + : MAX_SCHEDULE_TIMEOUT; + + int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); + if (r == -ETIME) { + bch2_trans_unlock_long(trans); + unlock_long_at = 0; + goto rewait; + } - int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); - if (r < 0) - return YN_NO; - buf[r] = '\0'; - } while ((ret = parse_yn_response(buf)) < 0); + if (r < 0) { + ret = YN_NO; + break; + } + + darray_last(line) = '\0'; + } while ((ret = parse_yn_response(line.data)) < 0); + darray_exit(&line); return ret; } #else #include "tools-util.h" -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { char *buf = NULL; size_t buflen = 0; @@ -168,20 +199,93 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return s; } -int bch2_fsck_err(struct bch_fs *c, +/* s/fix?/fixing/ s/recreate?/recreating/ */ +static void prt_actioning(struct printbuf *out, const char *action) +{ + unsigned len = strlen(action); + + BUG_ON(action[len - 1] != '?'); + --len; + + if (action[len - 1] == 'e') + --len; + + prt_bytes(out, action, len); + prt_str(out, "ing"); +} + +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + +static int do_fsck_ask_yn(struct bch_fs *c, + struct btree_trans *trans, + struct printbuf *question, + const char *action) +{ + prt_str(question, ", "); + prt_str(question, action); + + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", question->buf); + else + bch2_print_string_as_lines(KERN_ERR, question->buf); + + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + int ret = bch2_trans_relock(trans); + if (ret) + return ret; + } + + return ask; +} + +int __bch2_fsck_err(struct bch_fs *c, + struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) { struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false; + bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; - - if ((flags & FSCK_CAN_FIX) && - test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_fix; + const char *action_orig = "fix?", *action = action_orig; + + might_sleep(); + + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + + if (!c) + c = trans->c; + + /* + * Ugly: if there's a transaction in the current task it has to be + * passed in to unlock if we prompt for user input. + * + * But, plumbing a transaction and transaction restarts into + * bkey_validate() is problematic. 
+ * + * So: + * - make all bkey errors AUTOFIX, they're simple anyways (we just + * delete the key) + * - and we don't need to warn if we're not prompting + */ + WARN_ON((flags & FSCK_CAN_FIX) && + !(flags & FSCK_AUTOFIX) && + !trans && + bch2_current_has_btree_trans(c)); + + if (test_bit(err, c->sb.errors_silent)) + return flags & FSCK_CAN_FIX + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; bch2_sb_error_count(c, err); @@ -189,6 +293,19 @@ int bch2_fsck_err(struct bch_fs *c, prt_vprintf(out, fmt, args); va_end(args); + /* Custom fix/continue/recreate/etc.? */ + if (out->buf[out->pos - 1] == '?') { + const char *p = strrchr(out->buf, ','); + if (p) { + out->pos = p - out->buf; + action = kstrdup(p + 2, GFP_KERNEL); + if (!action) { + ret = -ENOMEM; + goto err; + } + } + } + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { @@ -199,13 +316,15 @@ int bch2_fsck_err(struct bch_fs *c, */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - return ret; + goto err_unlock; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (!s->last_msg) { + ret = -ENOMEM; + goto err_unlock; + } if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && @@ -224,14 +343,28 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + if (flags & FSCK_CAN_FIX) { + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + + goto print; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); inconsistent = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { prt_str(out, ", continuing"); @@ -246,36 +379,31 @@ int bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - int ask; - - prt_str(out, ": fix?"); - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(c); + ret = do_fsck_ask_yn(c, trans, out, action); + if (ret < 0) + goto err_unlock; - if (ask >= YN_ALLNO && s) - s->fix = ask == YN_ALLNO + if (ret >= YN_ALLNO && s) + s->fix = ret == YN_ALLNO ? FSCK_FIX_no : FSCK_FIX_yes; - ret = ask & 1 + ret = ret & 1 ? 
-BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { - prt_str(out, ", not fixing"); + prt_str(out, ", not "); + prt_actioning(out, action); } - } else if (flags & FSCK_NEED_FSCK) { - prt_str(out, " (run fsck to correct)"); - } else { + } else if (!(flags & FSCK_CAN_IGNORE)) { prt_str(out, " (repair unimplemented)"); } @@ -284,6 +412,13 @@ int bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; + if (test_bit(BCH_FS_fsck_running, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) { + exiting = true; + print = true; + } +print: if (print) { if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s\n", out->buf); @@ -291,9 +426,7 @@ int bch2_fsck_err(struct bch_fs *c, bch2_print_string_as_lines(KERN_ERR, out->buf); } - if (test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) + if (exiting) bch_err(c, "Unable to continue, halting"); else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); @@ -301,20 +434,81 @@ int bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_msgs_lock); + if (inconsistent) + bch2_inconsistent_error(c); + /* + * We don't yet track whether the filesystem currently has errors, for + * log_fsck_err()s: that would require us to track for every error type + * which recovery pass corrects it, to get the fsck exit status correct: + */ + if (flags & FSCK_CAN_FIX) { + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); + } + } +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: + if (action != action_orig) + kfree(action); printbuf_exit(&buf); + return ret; +} - if (inconsistent) - bch2_inconsistent_error(c); +static const char * const bch2_bkey_validate_contexts[] = { +#define x(n) #n, + BKEY_VALIDATE_CONTEXTS() +#undef x + NULL +}; - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); +int __bch2_bkey_fsck_err(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + enum bch_sb_error_id err, + const char *fmt, ...) 
+{ + if (from.flags & BCH_VALIDATE_silent) + return -BCH_ERR_fsck_delete_bkey; + + unsigned fsck_flags = 0; + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_delete_bkey; + + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + fsck_flags |= fsck_flags_extra[err]; + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "invalid bkey in %s", + bch2_bkey_validate_contexts[from.from]); + + if (from.from == BKEY_VALIDATE_journal) + prt_printf(&buf, " journal seq=%llu offset=%u", + from.journal_seq, from.journal_offset); + + prt_str(&buf, " btree="); + bch2_btree_id_to_text(&buf, from.btree); + prt_printf(&buf, " level=%u: ", from.level); + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); + printbuf_exit(&buf); return ret; } @@ -335,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } + +int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + /* XXX: we don't yet attempt to print paths when we don't know the subvol */ + if (inum.subvol) + ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + + return trans_was_restarted(trans, restart_count); +} + +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + int ret = bch2_inum_err_msg_trans(trans, out, inum); + prt_printf(out, " offset %llu: ", offset); + return ret; +} + +void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +{ + bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); +} + +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fec17d1353d1..7acf2a27ca28 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include <linux/list.h> #include <linux/printk.h> +#include "bkey_types.h" #include "sb-errors.h" struct bch_dev; @@ -30,7 +31,13 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); + +#define bch2_fs_topology_error(c, ...) \ +({ \ + bch_err(c, "btree topology error: " __VA_ARGS__); \ + bch2_topology_error(c); \ +}) #define bch2_fs_inconsistent(c, ...) \ ({ \ @@ -38,32 +45,11 @@ void bch2_topology_error(struct bch_fs *); bch2_inconsistent_error(c); \ }) -#define bch2_fs_inconsistent_on(cond, c, ...) \ +#define bch2_fs_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_fs_inconsistent(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * Later we might want to mark only the particular device inconsistent, not the - * entire filesystem: - */ - -#define bch2_dev_inconsistent(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch2_inconsistent_error((ca)->fs); \ -} while (0) - -#define bch2_dev_inconsistent_on(cond, ca, ...) 
\ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_dev_inconsistent(ca, __VA_ARGS__); \ + bch2_fs_inconsistent(__VA_ARGS__); \ _ret; \ }) @@ -102,27 +88,23 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -__printf(4, 5) __cold -int bch2_fsck_err(struct bch_fs *, +__printf(5, 6) __cold +int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, enum bch_sb_error_id, const char *, ...); +#define bch2_fsck_err(c, _flags, _err_type, ...) \ + __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ + type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ + _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) + void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, _err_type, ...) \ +#define fsck_err_wrap(_do) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ - __VA_ARGS__); \ - \ + int _ret = _do; \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -132,18 +114,21 @@ void bch2_flush_fsck_errs(struct bch_fs *); _ret == -BCH_ERR_fsck_fix; \ }) +#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) + /* These macros return true if error should be fixed: */ /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, _err_type, ...) \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) - -#define need_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - -#define need_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) +({ \ + might_sleep(); \ + \ + if (type_is(c, struct bch_fs *)) \ + WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ + \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ +}) #define mustfix_fsck_err(c, _err_type, ...) \ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) @@ -157,24 +142,38 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -__printf(4, 0) -static inline void bch2_bkey_fsck_err(struct bch_fs *c, - struct printbuf *err_msg, - enum bch_sb_error_id err_type, - const char *fmt, ...) -{ - va_list args; +#define log_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +#define log_fsck_err_on(cond, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + if (_ret) \ + log_fsck_err(__VA_ARGS__); \ + _ret; \ +}) - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); -} +enum bch_validate_flags; +__printf(5, 6) +int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, + struct bkey_validate_context from, + enum bch_sb_error_id, + const char *, ...); -#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +/* + * for now, bkey fsck errors are always handled by deleting the entire key - + * this will change at some point + */ +#define bkey_fsck_err(c, _err_type, _err_msg, ...) 
\ do { \ - prt_printf(_err_msg, __VA_ARGS__); \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ - ret = -BCH_ERR_invalid_bkey; \ + int _ret = __bch2_bkey_fsck_err(c, k, from, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) \ + ret = _ret; \ + ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } while (0) @@ -191,9 +190,9 @@ do { \ void bch2_fatal_error(struct bch_fs *); -#define bch2_fs_fatal_error(c, ...) \ +#define bch2_fs_fatal_error(c, _msg, ...) \ do { \ - bch_err(c, __VA_ARGS__); \ + bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \ bch2_fatal_error(c); \ } while (0) @@ -239,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); _ret; \ }) +int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); +int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); + +void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); +void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b9033bb4f11c..6aac579a692a 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, break; case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); struct btree_iter iter; @@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret2) { + BTREE_ITER_slots, r_k, ret2) { if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; @@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 61395b113df9..2d8042f853dc 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -21,6 +21,7 @@ #include "extents.h" #include "inode.h" #include "journal.h" +#include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -37,8 +38,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); -static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, - unsigned dev) +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, + unsigned dev) { struct bch_dev_io_failures *i; @@ -52,7 +53,7 @@ static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, void bch2_mark_io_failure(struct bch_io_failures *failed, struct extent_ptr_decoded *p) { - struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); if (!f) { BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); @@ -71,6 +72,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } +static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = 
bch2_dev_rcu(c, dev); + return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; +} + /* * returns true if p1 is better than p2: */ @@ -79,15 +86,20 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + u64 l1 = dev_latency(c, p1.ptr.dev); + u64 l2 = dev_latency(c, p2.ptr.dev); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + l1 *= l1; + l2 *= l2; /* Pick at random, biased in favor of the faster device: */ - return bch2_rand_range(l1 + l2) > l1; + return bch2_get_random_u64_below(l1 + l2) > l1; } if (bch2_force_reconstruct_read) @@ -109,47 +121,47 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_dev_io_failures *f; - struct bch_dev *ca; int ret = 0; if (k.k->type == KEY_TYPE_error) - return -EIO; + return -BCH_ERR_key_type_error; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ - if (p.ptr.unwritten) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); + if (p.ptr.unwritten) { + ret = 0; + break; + } /* * If there are any dirty pointers it's an error if we can't * read: */ if (!ret && !p.ptr.cached) - ret = -EIO; + ret = -BCH_ERR_no_device_to_read_from; - if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; if (f) p.idx = f->nr_failed < f->nr_retries ? 
f->idx : f->idx + 1; - if (!p.idx && - !bch2_dev_is_readable(ca)) + if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) p.idx++; - if (bch2_force_reconstruct_read && - !p.idx && p.has_ec) + if (!p.idx && p.has_ec && bch2_force_reconstruct_read) p.idx++; - if (p.idx >= (unsigned) p.has_ec + 1) + if (p.idx > (unsigned) p.has_ec) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -158,23 +170,23 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, *pick = p; ret = 1; } + rcu_read_unlock(); return ret; } /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, - btree_ptr_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, + c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -185,18 +197,28 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, - btree_ptr_v2_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, + c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), + c, btree_ptr_v2_min_key_bad, + "min_key > key"); + + if ((from.flags & BCH_VALIDATE_write) && + c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) + bkey_fsck_err_on(!bp.v->sectors_written, + c, btree_ptr_v2_written_0, + "sectors_written == 0"); + + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -242,7 +264,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; - struct bch_dev *ca; en_l = l_ptrs.start; en_r = r_ptrs.start; @@ -273,8 +294,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp.ptr.dev); - if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); + bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); + rcu_read_unlock(); + + if (!same_bucket) return false; if (lp.has_ec != rp.has_ec || @@ -379,15 +404,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; - bkey_fsck_err_on(!r.v->nr_replicas || 
r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, - reservation_key_nr_replicas_invalid, + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, + c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; @@ -662,16 +686,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - return __extent_ptr_durability(ca, p); + return ca ? __extent_ptr_durability(ca, p) : 0; } unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_failed) + if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) return 0; return __extent_ptr_durability(ca, p); @@ -684,8 +708,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -697,9 +723,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -760,14 +788,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; - union bch_extent_entry *ret = entry; bool drop_crc = true; + if (k.k->type == KEY_TYPE_stripe) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); @@ -790,21 +821,28 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, break; if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) { - ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_is_stripe_ptr(entry)) extent_entry_drop(k, entry); - } } - - return ret; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { + if (k.k->type != KEY_TYPE_stripe) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == ptr->dev && p.has_ec) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + } + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - union bch_extent_entry *ret = - bch2_bkey_drop_ptr_noerror(k, ptr); + + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -816,29 +854,20 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, !bch2_bkey_dirty_devs(k.s_c).nr) { k.k->type = KEY_TYPE_error; 
set_bkey_val_u64s(k.k, 0); - ret = NULL; } else if (!bch2_bkey_nr_ptrs(k.s_c)) { k.k->type = KEY_TYPE_deleted; set_bkey_val_u64s(k.k, 0); - ret = NULL; } - - return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr; - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); - - if (ptr) - bch2_bkey_drop_ptr_noerror(k, ptr); + bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); } const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) @@ -855,14 +884,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_dev *ca; + bool ret = false; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && + (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + !dev_ptr_stale_rcu(ca, ptr))) { + ret = true; + break; + } + rcu_read_unlock(); - return false; + return ret; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -903,8 +939,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; @@ -931,31 +988,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, + struct bch_extent_ptr *ptr) +{ + if (!opts->promote_target || + !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + return false; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + + return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); +} + +void bch2_extent_ptr_set_cached(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - union bch_extent_entry *ec = NULL; + struct extent_ptr_decoded p; - bkey_extent_entry_for_each(ptrs, entry) { + rcu_read_lock(); + if (!want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr_noerror(k, ptr); + goto out; + } + + /* + * Stripes can't contain cached data, for - reasons. + * + * Possibly something we can fix in the future? 
+ */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (&entry->ptr == ptr) { - ptr->cached = true; - if (ec) - extent_entry_drop(k, ec); - return; + if (p.has_ec) + bch2_bkey_drop_ptr_noerror(k, ptr); + else + ptr->cached = true; + goto out; } - if (extent_entry_is_stripe_ptr(entry)) - ec = entry; - else if (extent_entry_is_ptr(entry)) - ec = NULL; - } - BUG(); +out: + rcu_read_unlock(); } /* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * @@ -964,15 +1044,143 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bch_extent_ptr *ptr; + struct bch_dev *ca; + rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + (!(ca = bch2_dev_rcu(c, ptr->dev)) || + dev_ptr_stale_rcu(ca, ptr) > 0)); + rcu_read_unlock(); return bkey_deleted(k.k); } +/* + * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. + * + * Like bch2_extent_normalize(), but also only keeps a single cached pointer on + * the promote target. + */ +bool bch2_extent_normalize_by_opts(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k) +{ + struct bkey_ptrs ptrs; + bool have_cached_ptr; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr(k, ptr); + goto restart_drop_ptrs; + } + have_cached_ptr = true; + } + rcu_read_unlock(); + + return bkey_deleted(k.k); +} + +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) +{ + out->atomic++; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? 
" cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ca->mi.durability != 1) + prt_printf(out, " d=%u", ca->mi.durability); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + int stale = dev_ptr_stale_rcu(ca, ptr); + if (stale > 0) + prt_printf(out, " stale"); + else if (stale) + prt_printf(out, " invalid"); + } + rcu_read_unlock(); + --out->atomic; +} + +void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) +{ + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", + crc->compressed_size, + crc->uncompressed_size, + crc->offset, crc->nonce); + bch2_prt_csum_type(out, crc->csum_type); + prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); + prt_str(out, " compress "); + bch2_prt_compression_type(out, crc->compression_type); +} + +static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance *r) +{ + prt_str(out, "rebalance:"); + + prt_printf(out, " replicas=%u", r->data_replicas); + if (r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -988,43 +1196,17 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? 
" cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) - prt_printf(out, " stale"); - } + case BCH_EXTENT_ENTRY_ptr: + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; - } + case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type]); - bch2_prt_compression_type(out, crc.compression_type); + bch2_extent_crc_unpacked_to_text(out, &crc); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1034,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, (u64) ec->idx, ec->block); break; } - case BCH_EXTENT_ENTRY_rebalance: { - const struct bch_extent_rebalance *r = &entry->rebalance; - - prt_str(out, "rebalance: target "); - if (c) - bch2_target_to_text(out, c, r->target); - else - prt_printf(out, "%u", r->target); - prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->compression); + case BCH_EXTENT_ENTRY_rebalance: + bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; - } + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1055,58 +1229,51 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static int extent_ptr_invalid(struct bch_fs *c, - struct bkey_s_c k, - enum bkey_invalid_flags flags, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata, - struct printbuf *err) +static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - u64 bucket; - u32 bucket_offset; - struct bch_dev *ca; int ret = 0; - if (!bch2_dev_exists2(c, ptr->dev)) { - /* - * If we're in the write path this key might have already been - * overwritten, and we could be seeing a device that doesn't - * exist anymore due to racing with device removal: - */ - if (flags & BKEY_INVALID_WRITE) - return 0; - - bkey_fsck_err(c, err, ptr_to_invalid_device, - "pointer to invalid device (%u)", ptr->dev); - } - - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, - ptr_to_duplicate_device, + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - - bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, - ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, - ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, - ptr_spans_multiple_buckets, + /* bad pointers are repaired by check_fix_ptrs(): */ + rcu_read_lock(); + struct bch_dev *ca = 
bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) { + rcu_read_unlock(); + return 0; + } + u32 bucket_offset; + u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + unsigned first_bucket = ca->mi.first_bucket; + u64 nbuckets = ca->mi.nbuckets; + unsigned bucket_size = ca->mi.bucket_size; + rcu_read_unlock(); + + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, nbuckets); + bkey_fsck_err_on(bucket < first_bucket, + c, ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, + c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, ca->mi.bucket_size); + bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } -int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1121,25 +1288,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, - extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, + c, extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), c, err, - btree_ptr_has_non_ptr, + !extent_entry_is_ptr(entry), + c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, flags, &entry->ptr, - size_ondisk, false, err); + ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); if (ret) return ret; - bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, - ptr_cached_and_erasure_coded, + bkey_fsck_err_on(entry->ptr.cached && have_ec, + c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) @@ -1156,44 +1322,54 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, - ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, - ptr_crc_csum_type_unknown, + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, - ptr_crc_compression_type_unknown, + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, + c, ptr_crc_compression_type_unknown, "invalid compression type"); + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), 
+ c, ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + bkey_fsck_err_on(!crc_is_compressed(crc) && + crc.compressed_size != crc.uncompressed_size, + c, ptr_crc_uncompressed_size_mismatch, + "not compressed but compressed != uncompressed size"); + if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } - bkey_fsck_err_on(crc_since_last_ptr, c, err, - ptr_crc_redundant, + bkey_fsck_err_on(crc_since_last_ptr, + c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, - ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, c, err, - ptr_stripe_redundant, + bkey_fsck_err_on(have_ec, + c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { + /* + * this shouldn't be a fsck error, for forward + * compatibility; the rebalance code should just refetch + * the compression opt if it's unknown + */ +#if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { @@ -1202,28 +1378,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, opt.type, opt.level); return -BCH_ERR_invalid_bkey; } +#endif break; } } } - bkey_fsck_err_on(!nr_ptrs, c, err, - extent_ptrs_no_ptrs, + bkey_fsck_err_on(!nr_ptrs, + c, extent_ptrs_no_ptrs, "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, - extent_ptrs_too_many_ptrs, + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, + c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, c, err, - extent_ptrs_written_and_unwritten, + bkey_fsck_err_on(have_written && have_unwritten, + c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, - extent_ptrs_unwritten, + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, + c, extent_ptrs_unwritten, "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, c, err, - extent_ptrs_redundant_crc, + bkey_fsck_err_on(crc_since_last_ptr, + c, extent_ptrs_redundant_crc, "redundant crc entry"); - bkey_fsck_err_on(have_ec, c, err, - extent_ptrs_redundant_stripe, + bkey_fsck_err_on(have_ec, + c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; @@ -1243,7 +1420,7 @@ void bch2_ptr_swab(struct bkey_s k) for (entry = ptrs.start; entry < ptrs.end; entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { + switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32: @@ -1263,131 +1440,13 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_rebalance: break; + default: + /* Bad entry type: will be caught by validate() */ + return; } } } -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - 
return &entry->rebalance; - - return NULL; -} - -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned rewrite_ptrs = 0; - - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned i = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - rewrite_ptrs = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= 1U << i; - i++; - } - } -incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - unsigned i = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) - rewrite_ptrs |= 1U << i; - i++; - } - } - - return rewrite_ptrs; -} - -bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - /* - * If it's an indirect extent, we don't delete the rebalance entry when - * done so that we know what options were applied - check if it still - * needs work done: - */ - if (r && - k.k->type == KEY_TYPE_reflink_v && - !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) - r = NULL; - - return r != NULL; -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - struct bch_io_opts *opts) -{ - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *r; - unsigned target = opts->background_target; - unsigned compression = background_compression(*opts); - bool needs_rebalance; - - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - /* get existing rebalance entry: */ - r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (r) { - if (k.k->type == KEY_TYPE_reflink_v) { - /* - * indirect extents: existing options take precedence, - * so that we don't move extents back and forth if - * they're referenced by different inodes with different - * options: - */ - if (r->target) - target = r->target; - if (r->compression) - compression = r->compression; - } - - r->target = target; - r->compression = compression; - } - - needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); - - if (needs_rebalance && !r) { - union bch_extent_entry *new = bkey_val_end(k); - - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.compression = compression; - new->rebalance.target = target; - new->rebalance.unused = 0; - k.k->u64s += extent_entry_u64s(new); - } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { - /* - * For indirect extents, don't delete the rebalance entry when - * we're finished so that we know we specifically moved it or - * compressed it to its current location/compression type - */ - extent_entry_drop(k, (union bch_extent_entry *) r); - } - - return 0; -} - /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1447,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - le64_add_cpu(&p.v->idx, sub); + SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break; } case KEY_TYPE_inline_data: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6bf839d69e84..204d765dd74c 100644 --- a/fs/bcachefs/extents.h +++ 
b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -enum bkey_invalid_flags; /* extent entries: */ @@ -43,6 +42,11 @@ enum bkey_invalid_flags; #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +#define extent_entry_next_safe(_entry, _end) \ + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ + ? extent_entry_next(_entry) \ + : _end) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -103,17 +107,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; + return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; } static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; + return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { - switch (extent_entry_type(e)) { + switch (__extent_entry_type(e)) { case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: @@ -207,6 +211,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); } +void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { @@ -280,7 +286,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ @@ -318,7 +324,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (extent_entry_type(_entry)) { \ + switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ @@ -344,13 +350,13 @@ out: \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) -#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +#define bkey_crc_next(_k, _end, _crc, _iter) \ ({ \ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ if (extent_entry_is_crc(_iter)) { \ @@ -365,7 +371,7 @@ out: \ #define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ (_iter) = (_start); \ - bkey_crc_next(_k, _start, _end, _crc, _iter); \ + bkey_crc_next(_k, _end, _crc, _iter); \ (_iter) = extent_entry_next(_iter)) #define bkey_for_each_crc(_k, _p, _crc, _iter) \ @@ -392,6 +398,8 @@ out: \ /* utility code common to all keys with pointers: */ +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, + unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, @@ -400,26 +408,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* 
KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_v2_invalid, \ + .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -432,7 +440,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_invalid = bch2_bkey_ptrs_invalid, \ + .key_validate = bch2_bkey_ptrs_validate, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -442,13 +450,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ + .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ @@ -591,30 +599,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return BCH_DATA_user; - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? 
BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); @@ -626,9 +610,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); - const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) @@ -664,26 +645,38 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, - struct bch_extent_ptr *); -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); +void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); +void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); -#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); +void bch2_bkey_drop_device(struct bkey_s, unsigned); + +#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ do { \ - struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ - \ - _ptr = &_ptrs.start->ptr; \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ - _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ - _ptrs = bch2_bkey_ptrs(_k); \ - continue; \ + bch2_bkey_drop_ptr_noerror(_k, _ptr); \ + goto _again; \ } \ +} while (0) + +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +do { \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - (_ptr)++; \ - } \ + bkey_for_each_ptr(_ptrs, _ptr) \ + if (_cond) { \ + bch2_bkey_drop_ptr(_k, _ptr); \ + goto _again; \ + } \ } while (0) bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, @@ -692,23 +685,29 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); +void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, + struct bkey_s, struct bch_extent_ptr *); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); -void bch2_ptr_swab(struct bkey_s); - -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, - unsigned, unsigned); -bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +static inline bool bch2_extent_ptr_eq(struct 
bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +{ + return (ptr1.cached == ptr2.cached && + ptr1.unwritten == ptr2.unwritten && + ptr1.offset == ptr2.offset && + ptr1.dev == ptr2.dev && + ptr1.gen == ptr2.gen); +} -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - struct bch_io_opts *); +void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 3bd2fdbb0817..c198dfc376d6 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; +/* bch_extent_rebalance: */ +#include "rebalance_format.h" union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c new file mode 100644 index 000000000000..2eaffe37b5e7 --- /dev/null +++ b/fs/bcachefs/eytzinger.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "eytzinger.h" + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. 
If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap_func; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap_func(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, j, k; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + /* Find the sift-down path all the way to the leaves. 
*/ + for (j = i; k = j * 2 + 1, k + 1 < n;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == n) + j = j * 2 + 1; + + /* Backtrack to the correct location. */ + while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + j = (j - 1) / 2; + + /* Shift the element into its correct place. */ + for (k = j; j != i;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + /* Find the sift-down path all the way to the leaves. */ + for (j = 0; k = j * 2 + 1, k + 1 < i;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == i) + j = j * 2 + 1; + + /* Backtrack to the correct location. */ + while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + j = (j - 1) / 2; + + /* Shift the element into its correct place. */ + for (k = j; j;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + } + } +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap_func = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} + +#if 0 +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/ktime.h> + +static u64 cmp_count; + +static int mycmp(const void *a, const void *b) +{ + u32 _a = *(u32 *)a; + u32 _b = *(u32 *)b; + + cmp_count++; + if (_a < _b) + return -1; + else if (_a > _b) + return 1; + else + return 0; +} + +static int test(void) +{ + size_t N, i; + ktime_t start, end; + s64 delta; + u32 *arr; + + for (N = 10000; N <= 100000; N += 10000) { + arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); + cmp_count = 0; + + for (i = 0; i < N; i++) + arr[i] = get_random_u32(); + + start = ktime_get(); + eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); + end = ktime_get(); + + delta = ktime_us_delta(end, start); + printk(KERN_INFO "time: %lld\n", delta); + printk(KERN_INFO "comparisons: %lld\n", cmp_count); + + u32 prev = 0; + + eytzinger0_for_each(i, N) { + if (prev > arr[i]) + goto err; + prev = arr[i]; + } + + kfree(arr); + } + return 0; + +err: + kfree(arr); + return -1; +} +#endif diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index b04750dbf870..0541192d7bc0 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -5,23 +5,33 @@ #include <linux/bitops.h> #include <linux/log2.h> -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. + * + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. 
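/*
 * [Editor's illustrative aside - not part of the patch itself.]
 * A minimal sketch of how the zero based helpers in this header are meant to
 * be used, assuming a plain array of u32; cmp_u32() and example_lookup() are
 * hypothetical names invented for illustration.  Children of zero based node
 * i live at 2*i + 1 and 2*i + 2 (one based: 2*i and 2*i + 1), so both
 * children are adjacent in memory and can be prefetched together before the
 * comparison result is known.
 */
static int cmp_u32(const void *a, const void *b)
{
	u32 l = *(const u32 *)a, r = *(const u32 *)b;

	return l < r ? -1 : l > r ? 1 : 0;
}

static int example_lookup(u32 *arr, size_t nr, u32 want)
{
	/* Lay the array out in eytzinger (BFS) order: */
	eytzinger0_sort(arr, nr, sizeof(arr[0]), cmp_u32, NULL);

	/* Greatest element <= want, or -1 if every element is greater: */
	return eytzinger0_find_le(arr, nr, sizeof(arr[0]), cmp_u32, &want);
}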
* - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -38,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size); + return size ? rounddown_pow_of_two(size) : 0; } static inline unsigned eytzinger1_last(unsigned size) @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -91,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return size + ? (size + 1 - rounddown_pow_of_two(size)) << 1 + : 0; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, @@ -101,7 +113,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +138,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +176,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +243,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ -static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) +static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,21 +254,49 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); } while (n < nr); if (n & 1) { - /* @i was greater than @search, return previous node: */ - - if (i == eytzinger0_first(nr)) - return -1; - + /* + * @i was greater than @search, return previous node: + * + * if @i was leftmost/smallest element, + * eytzinger0_prev(eytzinger0_first())) returns -1, as expected + */ return eytzinger0_prev(i, nr); } else { return i; } } +static inline int eytzinger0_find_gt(void 
*base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + /* + * if eytitzinger0_find_le() returned -1 - no element was <= search - we + * want to return the first element; next/prev identities mean this work + * as expected + * + * similarly if find_le() returns last element, we should return -1; + * identities mean this all works out: + */ + return eytzinger0_next(idx, nr); +} + +static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + if (idx < nr && !cmp(base + idx * size, search)) + return idx; + + return eytzinger0_next(idx, nr); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ @@ -269,13 +307,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); #endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 66b945be10c2..d8153fe27037 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1c1ea0f0c692..2c3d46ac70c6 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, + BTREE_ITER_intent|BTREE_ITER_with_updates); if (ret) goto err; @@ -68,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); if (ret) goto err; @@ -78,7 +77,7 @@ int bch2_create_trans(struct btree_trans *trans, } ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -107,6 +106,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -162,7 +162,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create|BTREE_ITER_with_updates); if (ret) goto err; @@ -170,7 +170,11 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } - inode_iter.flags &= 
~BTREE_ITER_ALL_SNAPSHOTS; + if (S_ISDIR(mode) && + !new_inode->bi_subvol) + new_inode->bi_depth = dir_u->bi_depth + 1; + + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: @@ -197,16 +201,16 @@ int bch2_link_trans(struct btree_trans *trans, if (dir.subvol != inum.subvol) return -EXDEV; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) - goto err; + return ret; inode_u->bi_ctime = now; ret = bch2_inode_nlink_inc(inode_u); if (ret) - return ret; + goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -222,7 +226,7 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum.inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -242,7 +246,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -254,34 +258,41 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_intent); if (ret) goto err; ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) 
*/ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; @@ -314,7 +325,7 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -349,6 +360,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, return ret; } +static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) +{ + struct btree_iter iter; + struct bkey_i_subvolume *s = + bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_cached, subvolume); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + int bch2_rename_trans(struct btree_trans *trans, subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, @@ -370,7 +397,7 @@ int bch2_rename_trans(struct btree_trans *trans, int ret; ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -379,7 +406,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_dir.inum != src_dir.inum || dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -399,17 +426,47 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; if (dst_inum.inum) { ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); + if (ret) + goto err; + } + + if (src_inode_u->bi_subvol && + dst_dir.subvol != src_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + dst_inode_u->bi_subvol && + src_dir.subvol != dst_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); if (ret) goto err; } + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; @@ -432,10 +489,10 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inum)) { - ret = -ENOTEMPTY; - goto err; + if (S_ISDIR(dst_inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, dst_inum); + if (ret) + goto err; } } @@ -457,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == 
BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; @@ -493,3 +559,94 @@ err: bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } + +static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) +{ + bch2_printbuf_make_room(out, n); + + unsigned can_print = min(n, printbuf_remaining(out)); + + b += n; + + for (unsigned i = 0; i < can_print; i++) + out->buf[out->pos++] = *((char *) --b); + + printbuf_nul_terminate(out); +} + +static inline void prt_str_reversed(struct printbuf *out, const char *s) +{ + prt_bytes_reversed(out, s, strlen(s)); +} + +static inline void reverse_bytes(void *b, size_t n) +{ + char *e = b + n, *s = b; + + while (s < e) { + --e; + swap(*s, *e); + s++; + } +} + +/* XXX: we don't yet attempt to print paths when we don't know the subvol */ +int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +{ + unsigned orig_pos = path->pos; + int ret = 0; + + while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && + inum.inum == BCACHEFS_ROOT_INO)) { + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + if (ret) + goto disconnected; + + if (!inode.bi_dir && !inode.bi_dir_offset) { + ret = -BCH_ERR_ENOENT_inode_no_backpointer; + goto disconnected; + } + + inum.subvol = inode.bi_parent_subvol ?: inum.subvol; + inum.inum = inode.bi_dir; + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto disconnected; + + struct btree_iter d_iter; + struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, + BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), + 0, dirent); + ret = bkey_err(d.s_c); + if (ret) + goto disconnected; + + struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); + + prt_char(path, '/'); + + bch2_trans_iter_exit(trans, &d_iter); + } + + if (orig_pos == path->pos) + prt_char(path, '/'); +out: + ret = path->allocation_failure ? 
-ENOMEM : 0; + if (ret) + goto err; + + reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + return 0; +err: + return ret; +disconnected: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + prt_str_reversed(path, "(disconnected)"); + goto out; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index dde237859514..2b59210bb5e8 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_FS_COMMON_H #define _BCACHEFS_FS_COMMON_H +#include "dirent.h" + struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) @@ -40,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); +int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); + #endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 27710cdd5710..ab1d5db2fa56 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } @@ -158,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans, struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; - u32 snapshot; int ret = 0; rbio->c = c; @@ -166,29 +158,24 @@ static void bchfs_read(struct btree_trans *trans, rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); -retry: bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + POS(inum.inum, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_slots); while (1) { struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; + unsigned bytes, sectors; + s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -196,7 +183,7 @@ retry: k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -207,17 +194,17 @@ retry: ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) - break; + goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; @@ -236,22 +223,20 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); - - ret = 
btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } -err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } @@ -264,9 +249,9 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; - struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; + struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); @@ -274,8 +259,19 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; + /* + * Besides being a general performance optimization, plugging helps with + * avoiding btree transaction srcu warnings - submitting a bio can + * block, and we don't want todo that with the transaction locked. + * + * However, plugged bios are submitted when we schedule; we ideally + * would have our own scheduler hook to call unlock_long() before + * scheduling. + */ + blk_start_plug(&plug); bch2_pagecache_add_get(inode); + struct btree_trans *trans = bch2_trans_get(c); while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, readpages_iter.folios.nr - @@ -296,10 +292,10 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); bch2_trans_unlock(trans); } + bch2_trans_put(trans); bch2_pagecache_add_put(inode); - - bch2_trans_put(trans); + blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } @@ -314,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; + struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); + BUG_ON(folio_test_uptodate(folio)); + BUG_ON(folio_test_dirty(folio)); + if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; @@ -331,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); + blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -408,7 +410,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - folio_set_error(fi.folio); mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); @@ -445,8 +446,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -494,7 +495,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; 
op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; + op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; @@ -542,7 +543,7 @@ do_io: if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } @@ -624,15 +625,6 @@ do_io: BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; @@ -667,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -686,9 +678,9 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); @@ -736,12 +728,11 @@ out: goto err; } - *pagep = &folio->page; + *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); - *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); kfree(res); @@ -751,12 +742,11 @@ err_unlock: int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); @@ -827,9 +817,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping), &fs); if (ret) goto out; @@ -862,24 +851,32 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { + ssize_t f_reserved; + f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; + f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); + + if (unlikely(f_reserved != f_len)) { + if (f_reserved < 0) { + if (f == darray_first(fs)) { + ret = f_reserved; + goto out; + } + + folios_trunc(&fs, fi); + end = min(end, folio_end_pos(darray_last(fs))); + } else { + if (!folio_test_uptodate(f)) { + ret = 
bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + folios_trunc(&fs, fi + 1); + end = f_pos + f_reserved; + } - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&fs, fi); - if (!fs.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(fs))); break; } @@ -896,7 +893,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index a6126ff790e6..3207ebbb4ab4 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); +int bch2_write_begin(struct file *, struct address_space *, loff_t pos, + unsigned len, struct folio **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); + unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 33cb6da3a5ad..2089c36b5866 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; + struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); size_t shorten; @@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) */ dio->should_dirty = iter_is_iovec(iter); + blk_start_plug(&plug); + goto start; while (iter->count) { bio = bio_alloc_bioset(NULL, @@ -160,6 +163,8 @@ start: bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } + blk_finish_plug(&plug); + iter->count += shorten; if (sync) { @@ -179,7 +184,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; size_t count = iov_iter_count(iter); - ssize_t ret; + ssize_t ret = 0; if (!count) return 0; /* skip atime */ @@ -205,7 +210,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += ret; } else { bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); + ret = filemap_read(iocb, iter, ret); bch2_pagecache_add_put(inode); } out: @@ -254,7 +259,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { + BTREE_ITER_slots, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; @@ -369,6 +374,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { 
+ struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -387,6 +393,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -498,7 +506,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; + dio->op.subvol = inode->ei_inum.subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); dio->op.devs_need_flush = &inode->ei_devs_need_flush; @@ -536,7 +544,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (likely(!dio->iter.count) || dio->op.error) break; - bio_reset(bio, NULL, REQ_OP_WRITE); + bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); } out: return bch2_dio_write_done(dio); @@ -590,22 +598,27 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + return -EROFS; + inode_lock(&inode->v); ret = generic_write_checks(req, iter); if (unlikely(ret <= 0)) - goto err; + goto err_put_write_ref; ret = file_remove_privs(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; ret = file_update_time(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { + ret = -EINVAL; + goto err_put_write_ref; + } inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); @@ -618,7 +631,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); @@ -645,7 +658,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } ret = bch2_dio_write_loop(dio); -err: +out: if (locked) inode_unlock(&inode->v); return ret; @@ -653,7 +666,9 @@ err_put_bio: bch2_pagecache_block_put(inode); bio_put(bio); inode_dio_end(&inode->v); - goto err; +err_put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + goto out; } void bch2_fs_fs_io_direct_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index d359aa9b33b8..e072900e6a5b 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, break; f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) + if (IS_ERR(f)) break; BUG_ON(fs->nr && folio_pos(f) != pos); @@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio, int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct folio **fs, unsigned nr_folios) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; u64 offset = folio_sector(fs[0]); - unsigned folio_idx; - u32 snapshot; bool need_set = false; - int ret; - for (folio_idx = 0; folio_idx < nr_folios; 
folio_idx++) { - s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, if (!need_set) return 0; - folio_idx = 0; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); + unsigned folio_idx = 0; + + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inum.inum, offset), + POS(inum.inum, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } - return ret; + if (folio_idx == nr_folios) + break; + 0; + }))); } void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) @@ -419,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c, bch2_quota_reservation_put(c, inode, &res->quota); } -int bch2_folio_reservation_get(struct bch_fs *c, +static int __bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - unsigned offset, unsigned len) + size_t offset, size_t len, + bool partial) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; + struct disk_reservation disk_res = {}; + size_t reserved = len; int ret; if (!s) @@ -437,32 +420,70 @@ int bch2_folio_reservation_get(struct bch_fs *c, for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); quota_sectors += s->s[i].state == SECTOR_unallocated; } if (disk_sectors) 
{ - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, + partial ? BCH_DISK_RESERVATION_PARTIAL : 0); if (unlikely(ret)) return ret; + + if (unlikely(disk_res.sectors != disk_sectors)) { + disk_sectors = quota_sectors = 0; + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); + if (disk_sectors > disk_res.sectors) { + /* + * Make sure to get a reservation that's + * aligned to the filesystem blocksize: + */ + unsigned reserved_offset = round_down(i << 9, block_bytes(c)); + reserved = clamp(reserved_offset, offset, offset + len) - offset; + + if (!reserved) { + bch2_disk_reservation_put(c, &disk_res); + return -BCH_ERR_ENOSPC_disk_reservation; + } + break; + } + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + } } if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); + ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; + bch2_disk_reservation_put(c, &disk_res); return ret; } } - return 0; + res->disk.sectors += disk_res.sectors; + return partial ? reserved : 0; +} + +int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); +} + +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); } static void bch2_clear_folio_bits(struct folio *folio) diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 8cbaba6565b4..fad911cf5068 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -51,13 +51,10 @@ enum bch_folio_sector_state { struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - + u8 nr_replicas:4, /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; + replicas_reserved:4; + u8 state; }; struct bch_folio { @@ -102,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio) static inline struct bch_folio *__bch2_folio(struct folio *folio) { - return folio_has_private(folio) - ? 
(struct bch_folio *) folio_get_private(folio) - : NULL; + return folio_get_private(folio); } static inline struct bch_folio *bch2_folio(struct folio *folio) @@ -156,7 +151,12 @@ int bch2_folio_reservation_get(struct bch_fs *, struct bch_inode_info *, struct folio *, struct bch2_folio_reservation *, - unsigned, unsigned); + size_t, size_t); +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + size_t, size_t); void bch2_set_folio_dirty(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8c70123b6a0c..717e7b94c66f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* fsync: */ +static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) +{ + struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); + if (ret) + return ret; + + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + cur_seq, + (bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = cur_seq; + ret = bch2_inode_write(trans, &iter, &u); + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead @@ -174,25 +202,28 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked u; - int ret; - if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); - if (ret) - return ret; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; - return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: - bch2_inode_flush_nocow_writes(c, inode); + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, + bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, err; + + trace_bch2_fsync(file, datasync); ret = file_write_and_wait_range(file, start, end); if (ret) @@ -202,7 +233,15 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) goto out; ret = bch2_flush_inode(c, inode); out: - return bch2_err_class(ret); + ret = bch2_err_class(ret); + if (ret == -EROFS) + ret = -EIO; + + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; + + return ret; } /* truncate: */ @@ -211,30 +250,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); - if (ret) - goto err; - - 
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { - ret = 1; - break; - } - start = iter.pos; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ + bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); + }))); } static int __bch2_truncate_folio(struct bch_inode_info *inode, @@ -257,7 +277,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * folio */ - ret = range_has_data(c, inode->ei_subvol, + ret = range_has_data(c, inode->ei_inum.subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) @@ -265,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } @@ -446,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); @@ -505,7 +526,7 @@ static int inode_update_times_fn(struct btree_trans *trans, return 0; } -static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 end = offset + len; @@ -544,7 +565,7 @@ err: return ret; } -static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, +static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { @@ -580,7 +601,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, return ret; } -static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, +static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -594,9 +615,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); - while (!ret && bkey_lt(iter.pos, end_pos)) { + while (!ret) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; @@ -607,8 +628,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); + if (bkey_ge(iter.pos, end_pos)) + break; + ret = bch2_subvolume_get_snapshot(trans, - inode->ei_subvol, &snapshot); + inode->ei_inum.subvol, &snapshot); if (ret) goto bkey_err; @@ -643,12 +667,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, - opts.data_replicas, true)) + opts.data_replicas, true)) { ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, false), 0)); + if (ret) + goto bkey_err; + } bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) @@ -676,10 +703,13 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 
bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) - drop_locks_do(trans, + iter.pos.offset, true)) { + ret = drop_locks_do(trans, bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, false)); + if (ret) + goto bkey_err; + } bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -701,7 +731,7 @@ bkey_err: return ret; } -static long bchfs_fallocate(struct bch_inode_info *inode, int mode, +static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -803,41 +833,23 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot; u64 sectors = end - start; - u64 pos = start; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, pos, snapshot), 0); - - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && - !(ret = bkey_err(k))) { - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - bch2_btree_iter_advance(&iter); - } - pos = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, + BTREE_ID_extents, + POS(inode->v.i_ino, start), + POS(inode->v.i_ino, end - 1), + inode->ei_inum.subvol, 0, k, ({ + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } + + 0; + }))); return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } @@ -857,9 +869,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) - return -EOPNOTSUPP; - if ((pos_src & (block_bytes(c) - 1)) || (pos_dst & (block_bytes(c) - 1))) return -EINVAL; @@ -892,16 +901,24 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - file_update_time(file_dst); + if (!(remap_flags & REMAP_FILE_DEDUP)) + file_update_time(file_dst); bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); + /* + * XXX: we'd like to be telling bch2_remap_range() if we have + * permission to write to the source file, and thus if io path option + * changes should be propagated through the copy, but we need mnt_idmap + * from the pathwalk, awkward + */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - pos_dst + len, &i_sectors_delta); + pos_dst + len, &i_sectors_delta, + false); if (ret < 0) goto err; @@ -934,42 +951,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum 
= inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - POS(inode->v.i_ino, U64_MAX), - 0, k, ret) { - if (bkey_extent_is_data(k.k)) { - next_data = max(offset, bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, 0, k, ({ + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + 0; + }))); if (ret) return ret; @@ -987,50 +987,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_SLOTS, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0, false); break; - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0, false); + + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + 0; + }))); if (ret) return ret; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3dc8630ff9fe..15725b4ce393 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ATTR_CTIME); 
mutex_unlock(&inode->ei_update_lock); @@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_set_projid(c, inode, fa.fsx_projid) ?: bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); @@ -272,6 +272,69 @@ err1: return ret; } +static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) +{ + return put_user(inode->v.i_generation, arg); +} + +static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) +{ + int ret; + size_t len; + char label[BCH_SB_LABEL_SIZE]; + + BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); + + mutex_lock(&c->sb_lock); + memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + mutex_unlock(&c->sb_lock); + + len = strnlen(label, BCH_SB_LABEL_SIZE); + if (len == BCH_SB_LABEL_SIZE) { + bch_warn(c, + "label is too long, return the first %zu bytes", + --len); + } + + ret = copy_to_user(user_label, label, len); + + return ret ? -EFAULT : 0; +} + +static int bch2_ioc_setlabel(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + const char __user *user_label) +{ + int ret; + char label[BCH_SB_LABEL_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, user_label, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { + bch_err(c, + "unable to set label with more than %d bytes", + BCH_SB_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&c->sb_lock); + strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + mnt_drop_write_file(file); + return ret; +} + static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) { u32 flags; @@ -308,8 +371,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) return ret; } -static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -343,7 +406,7 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, sync_inodes_sb(c->vfs_sb); up_read(&c->vfs_sb->s_umount); } -retry: + if (arg.src_ptr) { error = user_path_at(arg.dirfd, (const char __user *)(unsigned long)arg.src_ptr, @@ -373,7 +436,7 @@ retry: } if (dst_dentry->d_inode) { - error = -EEXIST; + error = -BCH_ERR_EEXIST_subvolume_create; goto err3; } @@ -406,9 +469,12 @@ retry: !arg.src_ptr) snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + down_write(&c->snapshot_create_lock); inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); + up_write(&c->snapshot_create_lock); + error = PTR_ERR_OR_ZERO(inode); if (error) goto err3; @@ -420,25 +486,10 @@ err3: err2: if (arg.src_ptr) path_put(&src_path); - - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } err1: return error; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - down_write(&c->snapshot_create_lock); - long ret = __bch2_ioctl_subvolume_create(c, filp, arg); - up_write(&c->snapshot_create_lock); - - return ret; -} - static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, 
struct file *filp, struct bch_ioctl_subvolume arg) { @@ -506,13 +557,21 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case FS_IOC_GETVERSION: - ret = -ENOTTY; + ret = bch2_ioc_getversion(inode, (u32 __user *) arg); break; case FS_IOC_SETVERSION: ret = -ENOTTY; break; + case FS_IOC_GETFSLABEL: + ret = bch2_ioc_getlabel(c, (void __user *) arg); + break; + + case FS_IOC_SETFSLABEL: + ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); + break; + case FS_IOC_GOINGDOWN: ret = bch2_ioc_goingdown(c, (u32 __user *) arg); break; @@ -548,12 +607,18 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case FS_IOC_GETFLAGS: + case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542db9..90ade8f648d9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,19 +23,23 @@ #include "journal.h" #include "keylist.h" #include "quota.h" +#include "rebalance.h" #include "snapshot.h" #include "super.h" #include "xattr.h" +#include "trace.h" #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/exportfs.h> #include <linux/fiemap.h> +#include <linux/fs_context.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/posix_acl.h> #include <linux/random.h> #include <linux/seq_file.h> +#include <linux/siphash.h> #include <linux/statfs.h> #include <linux/string.h> #include <linux/xattr.h> @@ -56,15 +60,16 @@ void bch2_inode_update_after_write(struct btree_trans *trans, BUG_ON(bi->bi_inum != inode->v.i_ino); - bch2_assert_pos_locked(trans, BTREE_ID_inodes, - POS(0, bi->bi_inum), - c->opts.inodes_use_key_cache); + bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; + if (fields & ATTR_SIZE) + i_size_write(&inode->v, bi->bi_size); + if (fields & ATTR_ATIME) inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) @@ -89,10 +94,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(trans); - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT) ?: - (set ? set(trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(trans, &iter, &inode_u) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); + if (ret) + goto err; + + struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + ret = (set ? 
set(trans, inode, &inode_u, p) : 0); + if (ret) + goto err; + + struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + if (memcmp(&old_r, &new_r, sizeof(new_r))) { + ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); + if (ret) + goto err; + } + + ret = bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* @@ -101,14 +121,15 @@ retry: */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); - +err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "inode %u:%llu not found when updating", + "%s: inode %llu:%llu not found when updating", + bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -151,70 +172,342 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static int bch2_iget5_test(struct inode *vinode, void *p) +static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + return a.subvol == b.subvol && a.inum == b.inum; +} + +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return inode->ei_subvol == inum->subvol && - inode->ei_inode.bi_inum == inum->inum; + return siphash_2u64(inum->subvol, inum->inum, &k); } -static int bch2_iget5_set(struct inode *vinode, void *p) +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + const struct bch_inode_info *inode = data; - inode->v.i_ino = inum->inum; - inode->ei_subvol = inum->subvol; - inode->ei_inode.bi_inum = inum->inum; - return 0; + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); } -static unsigned bch2_inode_hash(subvol_inum inum) +static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) { - return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + const struct bch_inode_info *inode = obj; + const subvol_inum *v = arg->key; + + return !subvol_inum_eq(inode->ei_inum, *v); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static const struct rhashtable_params bch2_vfs_inodes_params = { + .head_offset = offsetof(struct bch_inode_info, hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum), + .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, + .obj_cmpfn = bch2_vfs_inode_cmp_fn, + .automatic_shrinking = true, +}; + +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +{ + struct bch_fs *c = trans->c; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; + DARRAY(u32) subvols; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return false; + + darray_init(&subvols); +restart_from_top: + + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. 
+ * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); +restart: + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +err: + darray_exit(&subvols); + return ret; +} + +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); +} + +static void __wait_on_freeing_inode(struct bch_fs *c, + struct bch_inode_info *inode, + subvol_inum inum) +{ + wait_queue_head_t *wq; + struct wait_bit_queue_entry wait; + + wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->v.i_lock); + + if (__bch2_inode_hash_find(c, inum) == inode) + schedule_timeout(HZ * 10); + finish_wait(wq, &wait.wq_entry); +} + +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, + subvol_inum inum) { - struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; +repeat: + inode = __bch2_inode_hash_find(c, inum); + if (inode) { + spin_lock(&inode->v.i_lock); + if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { + spin_unlock(&inode->v.i_lock); + return NULL; + } + if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { + if (!trans) { + __wait_on_freeing_inode(c, inode, inum); + } else { + bch2_trans_unlock(trans); + __wait_on_freeing_inode(c, inode, inum); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } + goto repeat; + } + __iget(&inode->v); + spin_unlock(&inode->v.i_lock); + } - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + return inode; +} - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); +static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) +{ + spin_lock(&inode->v.i_lock); 
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); + spin_unlock(&inode->v.i_lock); + + if (remove) { + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + inode->v.i_hash.pprev = NULL; + /* + * This pairs with the bch2_inode_hash_find() -> + * __wait_on_freeing_inode() path + */ + inode_wake_up_bit(&inode->v, __I_NEW); + } +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + struct btree_trans *trans, + struct bch_inode_info *inode) +{ + struct bch_inode_info *old = inode; + + set_bit(EI_INODE_HASHED, &inode->ei_flags); +retry: + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, + &inode->hash, + bch2_vfs_inodes_params))) { + old = bch2_inode_hash_find(c, trans, inode->ei_inum); + if (!old) + goto retry; + + clear_bit(EI_INODE_HASHED, &inode->ei_flags); + + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_state |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... + */ + set_nlink(&inode->v, 1); + discard_new_inode(&inode->v); + return old; + } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + return inode; } +} - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) + +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + BUG(); +} + +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) +{ + struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, + bch2_inode_cache, gfp); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; + mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { + kmem_cache_free(bch2_inode_cache, inode); + return NULL; + } + + return inode; +} + +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 
0 : -ENOMEM); + if (ret && inode) { + __destroy_inode(&inode->v); + kmem_cache_free(bch2_inode_cache, inode); + } + if (ret) + return ERR_PTR(ret); + } + + return inode; +} + +static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + struct bch_inode_info *inode = bch2_new_inode(trans); + if (IS_ERR(inode)) + return inode; + + bch2_vfs_inode_init(trans, inum, inode, bi, subvol); + + return bch2_inode_hash_insert(trans->c, trans, inode); + +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); + if (inode) + return &inode->v; + + struct btree_trans *trans = bch2_trans_get(c); - unlock_new_inode(&inode->v); + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + bch2_trans_put(trans); - return &inode->v; + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,12 +519,14 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; + kuid_t kuid; + kgid_t kgid; int ret; /* @@ -243,7 +538,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = to_bch_ei(new_inode(c->vfs_sb)); + inode = __bch2_new_inode(c, GFP_NOFS); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -258,13 +553,15 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); + kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); + ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? 
&dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), current_fsuid()), - from_kgid(i_user_ns(&dir->v), current_fsgid()), + from_kuid(i_user_ns(&dir->v), kuid), + from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -272,11 +569,10 @@ retry: if (unlikely(ret)) goto err_before_quota; - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_WITH_UPDATES, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -289,11 +585,10 @@ err_before_quota: if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -303,37 +598,16 @@ err_before_quota: * we must insert the new inode into the inode cache before calling * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: + * + * also, calling bch2_inode_hash_insert() without passing in the + * transaction object is sketchy - if we could ever end up in + * __wait_on_freeing_inode(), we'd risk deadlock. + * + * But that shouldn't be possible, since we still have the inode locked + * that we just created, and we _really_ can't take a transaction + * restart here. */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... 
- */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_hash_insert(c, NULL, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -352,23 +626,79 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + struct printbuf buf = PRINTBUF; + + struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + c, "dirent to missing inode:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + /* regular files may have hardlinks: */ + if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, &inode_u), + buf.buf))) { + ret = -ENOENT; + goto err; + } +out: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, @@ -398,11 +728,11 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct dentry *dentry) { - struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, 0, bch2_link_trans(trans, @@ -412,7 +742,7 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } @@ -431,8 +761,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: - bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = 
bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: + bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); @@ -449,11 +779,12 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + struct btree_trans *trans = bch2_trans_get(c); + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, @@ -464,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, goto err; bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); @@ -476,8 +807,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_put(trans); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } @@ -487,7 +818,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } @@ -544,15 +875,16 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; + bool whiteout = !!(flags & RENAME_WHITEOUT); int ret; - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) return -EINVAL; if (mode == BCH_RENAME_OVERWRITE) { @@ -562,18 +894,18 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - trans = bch2_trans_get(c); - bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir, src_inode, dst_inode); - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + trans = bch2_trans_get(c); + + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) - goto err; + goto err_tx_restart; if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, src_inode, @@ -593,29 +925,59 @@ static int bch2_rename2(struct mnt_idmap *idmap, if (ret) goto err; } +retry: + bch2_trans_begin(trans); - ret = commit_do(trans, NULL, NULL, 0, - bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode)); + ret = bch2_rename_trans(trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode); if (unlikely(ret)) + goto err_tx_restart; + + if (whiteout) { + whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); + ret = PTR_ERR_OR_ZERO(whiteout_inode_u); + if (unlikely(ret)) + goto err_tx_restart; + bch2_inode_init_early(c, whiteout_inode_u); + + ret = bch2_create_trans(trans, + inode_inum(src_dir), &src_dir_u, + whiteout_inode_u, + &src_dentry->d_name, + from_kuid(i_user_ns(&src_dir->v), current_fsuid()), + from_kgid(i_user_ns(&src_dir->v), current_fsgid()), + S_IFCHR|WHITEOUT_MODE, 0, + NULL, NULL, (subvol_inum) { 0 }, 0) ?: + bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_tx_restart; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (unlikely(ret)) { +err_tx_restart: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; goto err; + } BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); bch2_inode_update_after_write(trans, src_dir, &src_dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); if (src_dir != dst_dir) bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); @@ -652,11 +1014,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; + kuid_t kuid; + kgid_t kgid; - if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); + } + if (ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); + } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -671,11 +1039,11 @@ static void bch2_setattr_copy(struct 
mnt_idmap *idmap, if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? attr->ia_gid + ? kgid : inode->v.i_gid; - if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + if (!in_group_or_capable(idmap, &inode->v, + make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } @@ -691,17 +1059,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; + kuid_t kuid; + kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (attr->ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); + } - if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (attr->ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); + } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -715,7 +1089,7 @@ retry: acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -757,13 +1131,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; - stat->uid = inode->v.i_uid; - stat->gid = inode->v.i_gid; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); @@ -772,6 +1148,19 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; + stat->subvol = inode->ei_inum.subvol; + stat->result_mask |= STATX_SUBVOL; + + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + /* + * this is incorrect; we should be tracking this in superblock, + * and checking the alignment of open devices + */ + stat->dio_mem_align = SECTOR_SIZE; + stat->dio_offset_align = block_bytes(c); + } + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); @@ -801,7 +1190,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -892,16 +1281,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); - unsigned offset_into_extent, sectors; bool have_extent = false; - u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); if (start + len < start) return -EINVAL; 
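/*
 * Illustrative sketch, not part of this commit: the bch2_getattr()
 * changes above export STATX_SUBVOL and STATX_DIOALIGN, which a
 * statx() caller can request roughly like this.  "somefile" is a
 * hypothetical path, and stx_subvol requires userspace headers new
 * enough to define it (struct statx from Linux 6.10+).
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "somefile", 0,
		  STATX_SUBVOL | STATX_DIOALIGN, &stx))
		return 1;

	if (stx.stx_mask & STATX_SUBVOL)
		printf("subvolume: %llu\n",
		       (unsigned long long) stx.stx_subvol);
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio mem align %u, dio offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}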
@@ -910,42 +1297,50 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); - if (ret) - goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(ei->v.i_ino, start, snapshot), 0); + POS(ei->v.i_ino, start), 0); - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, end)).k && - !(ret = bkey_err(k))) { + while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { enum btree_id data_btree = BTREE_ID_extents; + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_max(&iter, end); + ret = bkey_err(k); + if (ret) + continue; + + if (!k.k) + break; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(&iter); continue; } - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) - break; + continue; k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + @@ -969,11 +1364,7 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } - start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!ret && have_extent) { bch2_trans_unlock(trans); @@ -1029,11 +1420,13 @@ static int bch2_open(struct inode *vinode, struct file *file) struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(vinode, file); } @@ -1043,6 +1436,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -1060,7 +1454,7 @@ static const struct inode_operations bch_file_inode_operations = { .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1080,7 +1474,7 @@ static const struct inode_operations bch_dir_inode_operations = { .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1102,7 +1496,7 @@ static const struct inode_operations bch_symlink_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, 
#endif }; @@ -1112,7 +1506,7 @@ static const struct inode_operations bch_special_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1126,7 +1520,6 @@ static const struct address_space_operations bch_address_space_operations = { .write_end = bch2_write_end, .invalidate_folio = bch2_invalidate_folio, .release_folio = bch2_release_folio, - .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif @@ -1159,8 +1552,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { - .inum = inode->ei_inode.bi_inum, - .subvol = inode->ei_subvol, + .inum = inode->ei_inum.inum, + .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } @@ -1245,7 +1638,7 @@ static struct dentry *bch2_get_parent(struct dentry *child) struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_subvol, + inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; @@ -1281,7 +1674,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1312,8 +1705,7 @@ retry: if (ret) goto err; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* @@ -1334,8 +1726,7 @@ retry: if (ret) continue; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } @@ -1367,20 +1758,18 @@ static const struct export_operations bch_export_ops = { .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + inode->v.i_ino = inum.inum; + inode->ei_inum = inum; + inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; - inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; @@ -1388,7 +1777,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); - inode->ei_subvol = inum.subvol; + + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1414,34 +1805,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, mapping_set_large_folios(inode->v.i_mapping); } -static struct inode *bch2_alloc_inode(struct super_block *sb) +static void bch2_free_inode(struct inode *vinode) { - struct bch_inode_info *inode; - - inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - 
mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - mutex_init(&inode->ei_quota_lock); - - return &inode->v; -} - -static void bch2_i_callback(struct rcu_head *head) -{ - struct inode *vinode = container_of(head, struct inode, i_rcu); - struct bch_inode_info *inode = to_bch_ei(vinode); - - kmem_cache_free(bch2_inode_cache, inode); -} - -static void bch2_destroy_inode(struct inode *vinode) -{ - call_rcu(&vinode->i_rcu, bch2_i_callback); + kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int inode_update_times_fn(struct btree_trans *trans, @@ -1477,6 +1843,17 @@ static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); + bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); + + /* + * evict() has waited for outstanding writeback, we'll do no more IO + * through this inode: it's safe to remove from VFS inode hashtable here + * + * Do that now so that other threads aren't blocked from pulling it back + * in, there's no reason for them to be: + */ + if (!delete) + bch2_inode_hash_remove(c, inode); truncate_inode_pages_final(&inode->v.i_data); @@ -1484,12 +1861,18 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + if (delete) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); + + /* + * If we are deleting, we need it present in the vfs hash table + * so that fsck can check if unlinked inodes are still open: + */ + bch2_inode_hash_remove(c, inode); } mutex_lock(&c->vfs_inodes_lock); @@ -1519,7 +1902,7 @@ again: mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_subvol)) + if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && @@ -1532,14 +1915,16 @@ again: break; } } else if (clean_pass && this_pass_clean) { - wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); mutex_unlock(&c->vfs_inodes_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); goto again; } } @@ -1572,7 +1957,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1967,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; @@ -1597,6 +1978,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait) struct bch_fs *c = 
sb->s_fs_info; int ret; + trace_bch2_sync_fs(sb, wait); + if (c->opts.journal_flush_disabled) return 0; @@ -1625,15 +2008,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, char *data) +static int bch2_remount(struct super_block *sb, int *flags, + struct bch_opts opts) { struct bch_fs *c = sb->s_fs_info; - struct bch_opts opts = bch2_opts_empty(); - int ret; - - ret = bch2_parse_mount_opts(c, &opts, data); - if (ret) - goto err; + int ret = 0; opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); @@ -1685,29 +2064,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - enum bch_opt_id i; struct printbuf buf = PRINTBUF; - int ret = 0; - - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; + bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, + OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); + printbuf_nul_terminate(&buf); + seq_printf(seq, ",%s", buf.buf); - printbuf_reset(&buf); - bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, - OPT_SHOW_MOUNT_STYLE); - seq_putc(seq, ','); - seq_puts(seq, buf.buf); - } - - if (buf.allocation_failure) - ret = -ENOMEM; + int ret = buf.allocation_failure ? -ENOMEM : 0; printbuf_exit(&buf); return ret; } @@ -1753,14 +2117,13 @@ static int bch2_unfreeze(struct super_block *sb) static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, - .destroy_inode = bch2_destroy_inode, + .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, .show_devname = bch2_show_devname, .show_options = bch2_show_options, - .remount_fs = bch2_remount, .put_super = bch2_put_super, .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, @@ -1793,75 +2156,63 @@ static int bch2_test_super(struct super_block *s, void *data) return true; } -static struct dentry *bch2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bch2_fs_get_tree(struct fs_context *fc) { struct bch_fs *c; struct super_block *sb; struct inode *vinode; - struct bch_opts opts = bch2_opts_empty(); + struct bch2_opts_parse *opts_parse = fc->fs_private; + struct bch_opts opts = opts_parse->opts; + darray_str devs; + darray_fs devs_to_fs = {}; int ret; - opt_set(opts, read_only, (flags & SB_RDONLY) != 0); - - ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) - return ERR_PTR(ret); + opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + opt_set(opts, nostart, true); - if (!dev_name || strlen(dev_name) == 0) - return ERR_PTR(-EINVAL); + if (!fc->source || strlen(fc->source) == 0) + return -EINVAL; - darray_str devs; - ret = bch2_split_devs(dev_name, &devs); + ret = bch2_split_devs(fc->source, &devs); if (ret) - return ERR_PTR(ret); + return ret; - darray_fs devs_to_fs = {}; darray_for_each(devs, i) { ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) { - sb = ERR_PTR(ret); - goto got_sb; - } + if (ret) + goto err; } - sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); + sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); if 
(!IS_ERR(sb)) goto got_sb; c = bch2_fs_open(devs.data, devs.nr, opts); - if (IS_ERR(c)) { - sb = ERR_CAST(c); - goto got_sb; - } + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; /* Some options can't be parsed until after the fs is started: */ - ret = bch2_parse_mount_opts(c, &opts, data); - if (ret) { - bch2_fs_stop(c); - sb = ERR_PTR(ret); - goto got_sb; - } + opts = bch2_opts_empty(); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); + if (ret) + goto err_stop_fs; bch2_opts_apply(&c->opts, opts); - sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); - if (IS_ERR(sb)) - bch2_fs_stop(c); -got_sb: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); - } + ret = bch2_fs_start(c); + if (ret) + goto err_stop_fs; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); + ret = PTR_ERR_OR_ZERO(sb); + if (ret) + goto err_stop_fs; +got_sb: c = sb->s_fs_info; if (sb->s_root) { - if ((flags ^ sb->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } @@ -1882,6 +2233,9 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); + super_set_sysfs_name_uuid(sb); + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -1925,11 +2279,30 @@ got_sb: sb->s_flags |= SB_ACTIVE; out: - return dget(sb->s_root); + fc->root = dget(sb->s_root); +err: + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); + if (ret) + pr_err("error: %s", bch2_err_str(ret)); + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; + return bch2_err_class(ret); + +err_stop_fs: + bch2_fs_stop(c); + goto err; err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); - return ERR_PTR(bch2_err_class(ret)); + goto err; } static void bch2_kill_sb(struct super_block *sb) @@ -1940,12 +2313,90 @@ static void bch2_kill_sb(struct super_block *sb) bch2_fs_free(c); } +static void bch2_fs_context_free(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = fc->fs_private; + + if (opts) { + printbuf_exit(&opts->parse_later); + kfree(opts); + } +} + +static int bch2_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + /* + * the "source" param, i.e., the name of the device(s) to mount, + * is handled by the VFS layer. 
+ */ + if (!strcmp(param->key, "source")) + return -ENOPARAM; + + struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = NULL; + + /* for reconfigure, we already have a struct bch_fs */ + if (fc->root) + c = fc->root->d_sb->s_fs_info; + + int ret = bch2_parse_one_mount_opt(c, &opts->opts, + &opts->parse_later, param->key, + param->string); + + return bch2_err_class(ret); +} + +static int bch2_fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct bch2_opts_parse *opts = fc->fs_private; + + return bch2_remount(sb, &fc->sb_flags, opts->opts); +} + +static const struct fs_context_operations bch2_context_ops = { + .free = bch2_fs_context_free, + .parse_param = bch2_fs_parse_param, + .get_tree = bch2_fs_get_tree, + .reconfigure = bch2_fs_reconfigure, +}; + +static int bch2_init_fs_context(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); + + if (!opts) + return -ENOMEM; + + opts->parse_later = PRINTBUF; + + fc->ops = &bch2_context_ops; + fc->fs_private = opts; + + return 0; +} + +void bch2_fs_vfs_exit(struct bch_fs *c) +{ + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); +} + +int bch2_fs_vfs_init(struct bch_fs *c) +{ + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); +} + static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .mount = bch2_mount, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "bcachefs", + .init_fs_context = bch2_init_fs_context, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); @@ -1960,7 +2411,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3af7225ff69..dd2198541455 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -13,6 +13,10 @@ struct bch_inode_info { struct inode v; + struct rhash_head hash; + struct rhlist_head by_inum_hash; + subvol_inum ei_inum; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; @@ -24,8 +28,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - u32 ei_subvol; - /* * When we've been doing nocow writes we'll need to issue flushes to the * underlying block devices @@ -50,10 +52,7 @@ struct bch_inode_info { static inline subvol_inum inode_inum(struct bch_inode_info *inode) { - return (subvol_inum) { - .subvol = inode->ei_subvol, - .inum = inode->ei_inode.bi_inum, - }; + return inode->ei_inum; } /* @@ -67,6 +66,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) * those: */ #define EI_INODE_SNAPSHOT 1 +#define EI_INODE_HASHED 2 #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -147,6 +147,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -187,6 +189,9 @@ int __bch2_unlink(struct inode *, 
struct dentry *, bool); void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); +void bch2_fs_vfs_exit(struct bch_fs *); +int bch2_fs_vfs_init(struct bch_fs *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -194,8 +199,14 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} + +static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} +static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } + static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a760777bafb..0e85131d0af8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bcachefs_ioctl.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" @@ -8,18 +9,63 @@ #include "darray.h" #include "dirent.h" #include "error.h" +#include "fs.h" #include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "snapshot.h" #include "super.h" +#include "thread_with_file.h" #include "xattr.h" #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + if (d.v->d_type == DT_SUBVOL + ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum) + return 0; + return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; +} + +static void dirent_inode_mismatch_msg(struct printbuf *out, + struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + prt_str(out, "inode points to dirent that does not point back:"); + prt_newline(out); + bch2_bkey_val_to_text(out, c, dirent.s_c); + prt_newline(out); + bch2_inode_unpacked_to_text(out, inode); +} + +static int dirent_points_to_inode(struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + int ret = dirent_points_to_inode_nowarn(dirent, inode); + if (ret) { + struct printbuf buf = PRINTBUF; + dirent_inode_mismatch_msg(&buf, c, dirent, inode); + bch_warn(c, "%s", buf.buf); + printbuf_exit(&buf); + } + return ret; +} + /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -29,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, { u64 sectors = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -46,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, { u64 subdirs = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -63,9 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret; - - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -79,36 +123,31 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; } - - ret = bch2_inode_unpack(k, inode); -err: + ret = -BCH_ERR_ENOENT_inode; +found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode) { struct btree_iter iter; struct bkey_s_c k; int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, *snapshot), 0); + SPOS(0, inode_nr, snapshot), 0); ret = bkey_err(k); if (ret) goto err; @@ -116,8 +155,6 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ret = bkey_is_inode(k.k) ? 
bch2_inode_unpack(k, inode) : -BCH_ERR_ENOENT_inode; - if (!ret) - *snapshot = iter.pos.snapshot; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -129,47 +166,19 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); + struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); + int ret = bkey_err(k); if (ret) return ret; - d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -184,23 +193,56 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) dir_hash_info = bch2_hash_info_init(c, &dir_inode); - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); return ret; } +/* + * Find any subvolume associated with a tree of snapshots + * We can't rely on master_subvol - it might have been deleted. 
+ */ +static int find_snapshot_tree_subvol(struct btree_trans *trans, + u32 tree_id, u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + if (le32_to_cpu(s.v->tree) != tree_id) + continue; + + if (s.v->subvol) { + *subvol = le32_to_cpu(s.v->subvol); + goto found; + } + } + ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound) + struct bch_inode_unpacked *lostfound, + u64 reattaching_inum) { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); + struct btree_iter lostfound_iter = { NULL }; u64 inum = 0; unsigned d_type = 0; int ret; @@ -211,20 +253,43 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (ret) return ret; - subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; - u32 subvol_snapshot; + u32 subvolid; + ret = find_snapshot_tree_subvol(trans, + bch2_snapshot_tree(c, snapshot), &subvolid); + bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", + bch2_snapshot_tree(c, snapshot)); + if (ret) + return ret; - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), - &subvol_snapshot, &root_inum.inum); - bch_err_msg(c, ret, "looking up root subvol"); + struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, subvolid, false, &subvol); + bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); if (ret) return ret; + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); + bch2_trans_iter_exit(trans, &iter); + } + + subvol_inum root_inum = { + .subvol = subvolid, + .inum = le64_to_cpu(subvol.inode) + }; + struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - u32 root_inode_snapshot = snapshot; - ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); - bch_err_msg(c, ret, "looking up root inode"); + ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + root_inum.inum, subvolid); if (ret) return ret; @@ -248,25 +313,37 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - ret = lookup_inode(trans, inum, lostfound, &snapshot); + ret = lookup_inode(trans, inum, snapshot, lostfound); bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; create_lostfound: /* + * we always create lost+found in the root snapshot; we don't want + * different branches of the snapshot tree to have different lost+found + */ + snapshot = le32_to_cpu(st.root_snapshot); + /* * XXX: we could have a nicer log message here if we had a nice way to * walk backpointers to print a path */ - bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); + struct printbuf path = PRINTBUF; + 
ret = bch2_inum_to_path(trans, root_inum, &path); + if (ret) + goto err; + + bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", + path.buf, root_inum.subvol, snapshot); + printbuf_exit(&path); u64 now = bch2_current_time(c); - struct btree_iter lostfound_iter = { NULL }; u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); lostfound->bi_dir = root_inode.bi_inum; + lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); root_inode.bi_nlink++; @@ -280,87 +357,345 @@ create_lostfound: goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, - BCH_HASH_SET_MUST_CREATE) ?: + STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch_err_msg(c, ret, "creating lost+found"); bch2_trans_iter_exit(trans, &lostfound_iter); return ret; } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 inode_snapshot) +static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) +{ + if (inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) + return false; + + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); +} + +static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bpos_eq(k.k->p, d_pos)) { + /* + * bch2_btree_delete_at() doesn't work because the update path doesn't + * internally use BTREE_ITER_with_updates yet + */ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct bch_hash_info dir_hash; + struct bch_fs *c = trans->c; struct bch_inode_unpacked lostfound; char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + u32 dirent_snapshot = inode->bi_snapshot; + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); if (ret) return ret; - if (S_ISDIR(inode->bi_mode)) { - lostfound.bi_nlink++; + lostfound.bi_nlink += S_ISDIR(inode->bi_mode); - ret = __write_inode(trans, &lostfound, U32_MAX); - if (ret) - return ret; + /* ensure lost+found inode is also present in inode snapshot */ + if (!inode->bi_subvol) { + BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot)); + lostfound.bi_snapshot =
inode->bi_snapshot; } - dir_hash = bch2_hash_info_init(trans->c, &lostfound); + ret = __bch2_fsck_write_inode(trans, &lostfound); + if (ret) + return ret; - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); - name = (struct qstr) QSTR(name_buf); + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); + struct qstr name = QSTR(name_buf); + + inode->bi_dir = lostfound.bi_inum; ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + &name, + inode->bi_subvol ?: inode->bi_inum, + &inode->bi_dir_offset, + STR_HASH_must_create); + if (ret) { + bch_err_msg(c, ret, "error creating dirent"); + return ret; + } + + ret = __bch2_fsck_write_inode(trans, inode); if (ret) return ret; - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit + * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { + snapshot_id_list whiteouts_done; + struct btree_iter iter; + struct bkey_s_c k; - return __write_inode(trans, inode, inode_snapshot); + darray_init(&whiteouts_done); + + for_each_btree_key_reverse_norestart(trans, iter, + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bkey_is_inode(k.k) || + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) + continue; + + struct bch_inode_unpacked child_inode; + ret = bch2_inode_unpack(k, &child_inode); + if (ret) + break; + + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot); + if (ret) + break; + + ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) + break; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; + child_inode.bi_dir_offset = inode->bi_dir_offset; + + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + break; + } + } + darray_exit(&whiteouts_done); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); } static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; + if (!inode->bi_dir) + return 0; - d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0, - dirent); - ret = bkey_err(d) ?: - __remove_dirent(trans, d.k->p); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } -struct snapshots_seen_entry { - u32 id; - u32 equiv; -}; +static int 
reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + ret = remove_backpointer(trans, &inode); + if (!bch2_err_matches(ret, ENOENT)) + bch_err_msg(c, ret, "removing dirent"); + if (ret) + return ret; + + ret = reattach_inode(trans, &inode); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + return ret; +} + +static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) +{ + struct bch_fs *c = trans->c; + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); + return -BCH_ERR_fsck_repair_unimplemented; + } + + /* + * If inum isn't set, that means we're being called from check_dirents, + * not check_inodes - the root of this subvolume doesn't exist or we + * would have found it there: + */ + if (!inum) { + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked new_inode; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) + return ret; + + inum = new_inode.bi_inum; + } + + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); + + struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + int ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + return ret; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->k.p.offset = subvolid; + new_subvol->v.snapshot = cpu_to_le32(snapshotid); + new_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); + if (ret) + return ret; + + struct btree_iter iter; + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); + if (ret) + return ret; + + u32 snapshot_tree = le32_to_cpu(s->v.tree); + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); + bch2_trans_iter_exit(trans, &iter); + + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); + if (ret) + return ret; + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) +{ + struct bch_fs *c = trans->c; + unsigned i_mode = S_IFREG; + u64 i_size = 0; + + switch (btree) { + case BTREE_ID_extents: { + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + i_size = k.k->p.offset << 9; + break; + } + case 
BTREE_ID_dirents: + i_mode = S_IFDIR; + break; + case BTREE_ID_xattrs: + break; + default: + BUG(); + } + + struct bch_inode_unpacked new_inode; + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + new_inode.bi_size = i_size; + new_inode.bi_inum = inum; + new_inode.bi_snapshot = snapshot; + + return __bch2_fsck_write_inode(trans, &new_inode); +} struct snapshots_seen { struct bpos pos; - DARRAY(struct snapshots_seen_entry) ids; + snapshot_id_list ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) @@ -375,20 +710,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - struct snapshots_seen_entry *i, n = { - .id = id, - .equiv = bch2_snapshot_equiv(c, id), - }; - int ret = 0; - + u32 *i; __darray_for_each(s->ids, i) { - if (i->id == id) + if (*i == id) return 0; - if (i->id > id) + if (*i > id) break; } - ret = darray_insert_item(&s->ids, i - s->ids.data, n); + int ret = darray_insert_item(&s->ids, i - s->ids.data, id); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); @@ -398,42 +728,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry n = { - .id = pos.snapshot, - .equiv = bch2_snapshot_equiv(c, pos.snapshot), - }; - int ret = 0; - if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - s->pos = pos; - s->pos.snapshot = n.equiv; - - darray_for_each(s->ids, i) { - if (i->id == n.id) - return 0; - /* - * We currently don't rigorously track for snapshot cleanup - * needing to be run, so it shouldn't be a fsck error yet: - */ - if (i->equiv == n.equiv) { - bch_err(c, "snapshot deletion did not finish:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_id_str(btree_id), - pos.inode, pos.offset, - i->id, n.id, n.equiv); - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } - - ret = darray_push(&s->ids, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; + return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); } /** @@ -453,12 +752,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ssize_t i; EBUG_ON(id > ancestor); - EBUG_ON(!bch2_snapshot_is_equiv(c, id)); - EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); + EBUG_ON(ancestor != darray_last(seen->ids)); if (id == ancestor) return true; @@ -477,9 +774,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see */ for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i].equiv >= id; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) return false; return true; @@ -510,9 +807,6 @@ static int ref_visible2(struct bch_fs *c, u32 src, struct snapshots_seen *src_seen, u32 dst, struct snapshots_seen *dst_seen) { - src = bch2_snapshot_equiv(c, src); - dst = bch2_snapshot_equiv(c, dst); - if (dst > src) { 
swap(dst, src); swap(dst_seen, src_seen); @@ -528,21 +822,24 @@ static int ref_visible2(struct bch_fs *c, struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; - bool seen_this_pos; u64 count; + u64 i_size; }; struct inode_walker { bool first_this_inode; + bool have_inodes; bool recalculate_sums; struct bpos last_pos; DARRAY(struct inode_walker_entry) inodes; + snapshot_id_list deletes; }; static void inode_walker_exit(struct inode_walker *w) { darray_exit(&w->inodes); + darray_exit(&w->deletes); } static struct inode_walker inode_walker_init(void) @@ -555,11 +852,10 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, { struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(inode, &u)); - - return darray_push(&w->inodes, ((struct inode_walker_entry) { + return bch2_inode_unpack(inode, &u) ?: + darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + .snapshot = inode.k->p.snapshot, })); } @@ -571,11 +867,17 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bkey_s_c k; int ret; + /* + * We no longer have inodes for w->last_pos; clear this to avoid + * screwing up check_i_sectors/check_subdir_count if we take a + * transaction restart here: + */ + w->have_inodes = false; w->recalculate_sums = false; w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; @@ -588,41 +890,45 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; + w->have_inodes = true; return 0; } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) goto found; return NULL; found: - BUG_ON(snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->snapshot); - if (snapshot != i->snapshot && !is_whiteout) { + if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; - new.snapshot = snapshot; - new.count = 0; + new.snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); - while (i > w->inodes.data && i[-1].snapshot > snapshot) + while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -633,27 +939,24 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct 
inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { - darray_for_each(w->inodes, i) - i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } -static int __get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -661,21 +964,23 @@ static int __get_visible_inodes(struct btree_trans *trans, int ret; w->inodes.nr = 0; + w->deletes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; - if (!ref_visible(c, s, s->pos.snapshot, equiv)) + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) continue; - if (bkey_is_inode(k.k)) - add_inode(c, w, k); + if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) + continue; - if (equiv >= s->pos.snapshot) + ret = bkey_is_inode(k.k) + ? add_inode(c, w, k) + : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); + if (ret) break; } bch2_trans_iter_exit(trans, &iter); @@ -683,143 +988,149 @@ static int __get_visible_inodes(struct btree_trans *trans, return ret; } -static int check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +/* + * Prefer to delete the first one, since that will be the one at the wrong + * offset: + * return value: 0 -> delete k1, 1 -> delete k2 + */ +int bch2_fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + if (new->k.type != KEY_TYPE_dirent) + return 0; + + struct bkey_i_dirent *d = bkey_i_to_dirent(new); + struct inode_walker target = inode_walker_init(); int ret = 0; - if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; -fsck_err: - printbuf_exit(&buf); + if (d->v.d_type == DT_SUBVOL) { + BUG(); + } else { + ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); + if (ret) + goto err; + + darray_for_each(target.inodes, i) { + i->inode.bi_dir_offset = d->k.p.offset; + ret = __bch2_fsck_write_inode(trans, &i->inode); + if (ret) + goto err; + } + } +err: + inode_walker_exit(&target); return ret; } -static int hash_redo_key(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k) +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct 
bch_inode_unpacked *inode, + u32 *snapshot) { - struct bkey_i *delete; - struct bkey_i *tmp; - - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - if (IS_ERR(delete)) - return PTR_ERR(delete); + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } - tmp = bch2_bkey_make_mut_noupdate(trans, k); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} - bkey_init(&delete->k); - delete->k.p = k_iter->pos; - return bch2_btree_iter_traverse(k_iter) ?: - bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, - (subvol_inum) { 0, k.k->p.inode }, - k.k->p.snapshot, tmp, - BCH_HASH_SET_MUST_CREATE, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; + bch2_trans_iter_exit(trans, &iter); + return ret; } -static int hash_check_key(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) +static int check_inode_dirent_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + bool *write_inode) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - u64 hash; - int ret = 0; - - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_SLOTS, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - if (fsck_err_on(k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k), c, - hash_table_key_duplicate, - "duplicate hash table keys:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - buf.buf))) { - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; - break; - } + u32 inode_snapshot = inode->bi_snapshot; + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - goto bad_hash; - } + if (fsck_err_on(ret, + trans, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || + fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), + trans, inode_points_to_wrong_dirent, + "%s", + (printbuf_reset(&buf), + dirent_inode_mismatch_msg(&buf, c, d, inode), + buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. 
+ */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + *write_inode = true; } -out: - bch2_trans_iter_exit(trans, &iter); + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); + bch_err_fn(c, ret); return ret; -bad_hash: - if (fsck_err(c, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - bch_err_fn(c, ret); - if (ret) - return ret; - ret = -BCH_ERR_transaction_restart_nested; - } -fsck_err: - goto out; } -static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +static int get_snapshot_root_inode(struct btree_trans *trans, + struct bch_inode_unpacked *root, + u64 inum) { struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k); - if (ret) - return ret; + struct bkey_s_c k; + int ret = 0; + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found_root; + } + if (ret) + goto err; + BUG(); +found_root: + ret = bch2_inode_unpack(k, root); +err: bch2_trans_iter_exit(trans, &iter); - return k.k->type == KEY_TYPE_set; + return ret; } static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct bch_inode_unpacked *prev, - struct snapshots_seen *s, - bool full) + struct bch_inode_unpacked *snapshot_root, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; struct bch_inode_unpacked u; bool do_update = false; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; if (ret) @@ -832,140 +1143,187 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (!full && - !(u.bi_flags & (BCH_INODE_i_size_dirty| - BCH_INODE_i_sectors_dirty| - BCH_INODE_unlinked))) - return 0; - - if (prev->bi_inum != u.bi_inum) - *prev = u; - - if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || - inode_d_type(prev) != inode_d_type(&u), - c, inode_snapshot_mismatch, - "inodes in different snapshots don't match")) { - bch_err(c, "repair not implemented yet"); - return -BCH_ERR_fsck_repair_unimplemented; - } - - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { - struct bpos new_min_pos; + ret = bch2_inode_unpack(k, &u); + if (ret) + goto err; - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); + if (snapshot_root->bi_inum != u.bi_inum) { + ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); if (ret) goto err; + } - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; + if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + u.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); + do_update = true; + } - ret = __write_inode(trans, &u, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck 
updating inode"); + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) - return ret; + goto err; + } - if (!bpos_eq(new_min_pos, POS_MIN)) - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); - return 0; + if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_flags &= ~BCH_INODE_unlinked; + do_update = true; } - if (u.bi_flags & BCH_INODE_unlinked) { - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - return ret; + if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { + /* Check for this early so that check_unreachable_inode() will reattach it */ + + ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); + if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) + goto err; - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); + fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, + "dir unlinked but not empty\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf)); + u.bi_flags &= ~BCH_INODE_unlinked; + do_update = true; ret = 0; } - if (u.bi_flags & BCH_INODE_unlinked && - (!c->sb.clean || - fsck_err(c, inode_unlinked_but_clean, - "filesystem marked clean, but inode %llu unlinked", - u.bi_inum))) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - return ret; + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), + trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be %u)\n%s", + ret, + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + if (ret) + u.bi_flags |= BCH_INODE_has_child_snapshot; + else + u.bi_flags &= ~BCH_INODE_has_child_snapshot; + do_update = true; } + ret = 0; - if (u.bi_flags & BCH_INODE_i_size_dirty && - (!c->sb.clean || - fsck_err(c, inode_i_size_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); + if ((u.bi_flags & BCH_INODE_unlinked) && + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { + if (!test_bit(BCH_FS_started, &c->flags)) { + /* + * If we're not in online fsck, don't delete unlinked + * inodes, just make sure they're on the deleted list. + * + * They might be referred to by a logged operation - + * i.e. we might have crashed in the middle of a + * truncate on an unlinked but open file - so we want to + * let the delete_dead_inodes kill it after resuming + * logged ops. 
+ */ + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + goto err_noprint; - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, - iter->pos.snapshot), - POS(u.bi_inum, U64_MAX), - 0, NULL); - bch_err_msg(c, ret, "in fsck truncating inode"); - if (ret) - return ret; + fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - u.bi_flags |= BCH_INODE_i_sectors_dirty; + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); + if (ret) + goto err; + } else { + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; - u.bi_flags &= ~BCH_INODE_i_size_dirty; + if (fsck_err_on(!ret, + trans, inode_unlinked_and_not_open, + "inode %llu:%u unlinked and not open", + u.bi_inum, u.bi_snapshot)) { + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); + goto err_noprint; + } + ret = 0; + } + } + + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + trans, inode_bi_parent_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; do_update = true; } - if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!c->sb.clean || - fsck_err(c, inode_i_sectors_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; + if (u.bi_subvol) { + struct bch_subvolume s; - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); + ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); - if (sectors < 0) { - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); - return sectors; + if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); + goto do_update; } - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; - do_update = true; + if (fsck_err_on(ret, + trans, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + trans, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } } - if (u.bi_flags & BCH_INODE_backptr_untrusted) { - u.bi_dir = 0; - u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_backptr_untrusted; + if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } - 
+do_update: if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) - return ret; + goto err_noprint; } err: fsck_err: bch_err_fn(c, ret); +err_noprint: + printbuf_exit(&buf); return ret; } int bch2_check_inodes(struct bch_fs *c) { - bool full = c->opts.fsck; - struct bch_inode_unpacked prev = { 0 }; + struct bch_inode_unpacked snapshot_root = {}; struct snapshots_seen s; snapshots_seen_init(&s); @@ -973,41 +1331,178 @@ int bch2_check_inodes(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &prev, &s, full))); + check_inode(trans, &iter, k, &snapshot_root, &s))); snapshots_seen_exit(&s); bch_err_fn(c, ret); return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) +static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) { - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + /* + * We look for inodes to reattach in natural key order, leaves first, + * but we should do the reattach at the oldest version that needs to be + * reattached: + */ + for_each_btree_key_norestart(trans, iter, + BTREE_ID_inodes, + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) + continue; + + if (!bkey_is_inode(k.k)) + break; + + struct bch_inode_unpacked parent_inode; + ret = bch2_inode_unpack(k, &parent_inode); + if (ret) + break; + + if (!inode_should_reattach(&parent_inode)) + break; + + *inode = parent_inode; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; } -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) +static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!bkey_is_inode(k.k)) + return 0; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + return ret; + + if (!inode_should_reattach(&inode)) + return 0; + + ret = find_oldest_inode_needs_reattach(trans, &inode); + if (ret) + return ret; + + if (fsck_err(trans, inode_unreachable, + "unreachable inode:\n%s", + (bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) + ret = reattach_inode(trans, &inode); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * Reattach unreachable (but not unlinked) inodes + * + * Run after check_inodes() and check_dirents(), so we know that inode + * backpointer fields point to valid dirents, and every inode that has a dirent + * that points to it has its backpointer field set - so we're just looking for + * non-unlinked inodes without backpointers: + * + * XXX: this is racy w.r.t.
hardlink removal in online fsck + */ +int bch2_check_unreachable_inodes(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_unreachable_inode(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; } -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) +static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) { - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; + switch (btree) { + case BTREE_ID_extents: + return S_ISREG(mode) || S_ISLNK(mode); + case BTREE_ID_dirents: + return S_ISDIR(mode); + case BTREE_ID_xattrs: + return true; + default: + BUG(); + } } -static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +static int check_key_has_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct inode_walker *inode, + struct inode_walker_entry *i, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (fsck_err_on(!i, + trans, key_in_missing_inode, + "key in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + trans, key_in_wrong_inode_type, + "key for wrong inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); + goto out; +} + +static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1021,25 +1516,32 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) i->count = count2; if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); - return -BCH_ERR_internal_fsck_err; + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); + i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - c, inode_i_sectors_wrong, + trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) break; } } fsck_err: bch_err_fn(c, ret); - return ret ?: trans_was_restarted(trans, restart_count); + return ret; +} + +static int check_i_sectors(struct btree_trans *trans, struct 
inode_walker *w) +{ + u32 restart_count = trans->restart_count; + return check_i_sectors_notnested(trans, w) ?: + trans_was_restarted(trans, restart_count); } struct extent_end { @@ -1118,9 +1620,9 @@ static int overlapping_extents_found(struct btree_trans *trans, BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOT_EXTENTS); - k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); + k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1145,7 +1647,7 @@ static int overlapping_extents_found(struct btree_trans *trans, while (1) { bch2_btree_iter_advance(&iter2); - k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -1168,7 +1670,7 @@ static int overlapping_extents_found(struct btree_trans *trans, prt_printf(&buf, "\n overwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); - if (fsck_err(c, extent_overlapping, + if (fsck_err(trans, extent_overlapping, "overlapping extents%s", buf.buf)) { struct btree_iter *old_iter = &iter1; struct disk_reservation res = { 0 }; @@ -1181,7 +1683,7 @@ static int overlapping_extents_found(struct btree_trans *trans, trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + BTREE_UPDATE_internal_snapshot_node, k1, k2) ?: bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); @@ -1222,7 +1724,6 @@ static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, struct extent_ends *extent_ends, struct bkey_s_c k, - u32 equiv, struct btree_iter *iter, bool *fixed) { @@ -1255,10 +1756,6 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; } - ret = extent_ends_at(c, extent_ends, seen, k); - if (ret) - goto err; - extent_ends->last_pos = k.k->p; err: return ret; @@ -1290,81 +1787,59 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s, - struct extent_ends *extent_ends) + struct extent_ends *extent_ends, + struct disk_reservation *res) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv = k.k->p; int ret = 0; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? 
ret : 0; goto out; } - if (inode->last_pos.inode != k.k->p.inode) { + if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { ret = check_i_sectors(trans, inode); if (ret) goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); - ret = PTR_ERR_OR_ZERO(i); + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); + ret = PTR_ERR_OR_ZERO(extent_i); if (ret) goto err; - if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!i, c, extent_in_missing_inode, - "extent in missing inode:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; - - if (fsck_err_on(i && - !S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), - c, extent_in_non_reg_inode, - "extent in non regular inode mode %o:\n %s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + ret = check_key_has_inode(trans, iter, inode, extent_i, k); + if (ret) + goto err; - ret = check_overlapping_extents(trans, s, extent_ends, k, - equiv.snapshot, iter, + if (k.k->type != KEY_TYPE_whiteout) { + ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) goto err; - } - - /* - * Check inodes in reverse order, from oldest snapshots to newest, - * starting from the inode that matches this extent's snapshot. If we - * didn't have one, iterate over all inodes: - */ - if (!i) - i = inode->inodes.data + inode->inodes.nr - 1; - for (; - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->snapshot > equiv.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) - continue; + /* + * Check inodes in reverse order, from oldest snapshots to + * newest, starting from the inode that matches this extent's + * snapshot. 
If we didn't have one, iterate over all inodes: + */ + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; - if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), - c, extent_past_end_of_inode, + trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1374,19 +1849,37 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; iter->k.type = KEY_TYPE_whiteout; + break; } + } + } + + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + if (bkey_extent_is_allocation(k.k)) { + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; - if (bkey_extent_is_allocation(k.k)) - i->count += k.k->size; + i->count += k.k->size; } + } - i->seen_this_pos = true; + if (k.k->type != KEY_TYPE_whiteout) { + ret = extent_ends_at(c, extent_ends, s, k); + if (ret) + goto err; } out: err: @@ -1394,9 +1887,6 @@ fsck_err: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; -delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - goto out; } /* @@ -1414,16 +1904,14 @@ int bch2_check_extents(struct bch_fs *c) extent_ends_init(&extent_ends); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + for_each_btree_key(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: check_extent_overbig(trans, &iter, k); })) ?: - check_i_sectors(trans, &w)); + check_i_sectors_notnested(trans, &w)); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); @@ -1441,7 +1929,7 @@ int bch2_check_indirect_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1453,10 +1941,9 @@ int bch2_check_indirect_extents(struct bch_fs *c) return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1469,111 +1956,166 @@ static int check_subdir_count(struct btree_trans 
*trans, struct inode_walker *w) return count2; if (i->count != count2) { - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", - i->count, count2); + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; } if (fsck_err_on(i->inode.bi_nlink != i->count, - c, inode_dir_wrong_nlink, + trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) break; } } fsck_err: bch_err_fn(c, ret); - return ret ?: trans_was_restarted(trans, restart_count); + return ret; } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) +{ + u32 restart_count = trans->restart_count; + return check_subdir_count_notnested(trans, w) ?: + trans_was_restarted(trans, restart_count); +} + +noinline_for_stack +static int check_dirent_inode_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; struct printbuf buf = PRINTBUF; struct btree_iter bp_iter = { NULL }; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - if (fsck_err_on(S_ISDIR(target->bi_mode) && 
backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + goto out; + } - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +noinline_for_stack +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = check_dirent_inode_dirent(trans, iter, d, target); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), - c, dirent_d_type_wrong, + trans, dirent_d_type_wrong, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), @@ -1586,6 +2128,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + 
n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) @@ -1593,33 +2141,164 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); +/* find a subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +noinline_for_stack +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; + struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 parent_snapshot; + u32 new_parent_subvol = 0; + u64 parent_inum; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret2 && !bch2_err_matches(ret, ENOENT)) + return ret2; + } + + if (ret && + !new_parent_subvol && + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + /* + * Couldn't find a subvol for dirent's snapshot - but we lost + * subvols, so we need to reconstruct: + */ + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); + if (ret) + return ret; + + parent_snapshot = d.k->p.snapshot; + } + + if (fsck_err_on(ret, + trans, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + trans, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", + parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); + return -BCH_ERR_fsck_repair_unimplemented; + } + + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); if (ret) goto err; - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + 
new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } - ret = bch2_trans_update(trans, iter, &n->k_i, 0); + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret) { + if (fsck_err(trans, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; + } + + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + trans, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - d = dirent_i_to_s_c(n); + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, + trans, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", + target_inum, + subvol_root.bi_parent_subvol, parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; + subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); + ret = __bch2_fsck_write_inode(trans, &subvol_root); + if (ret) + goto err; } + + ret = check_dirent_target(trans, iter, d, &subvol_root); + if (ret) + goto err; out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &subvol_iter); printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -1631,21 +2310,16 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? 
ret : 0; goto out; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -1653,46 +2327,29 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (dir->last_pos.inode != k.k->p.inode) { - ret = check_subdir_count(trans, dir); + if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { + ret = check_subdir_dirents_count(trans, dir); if (ret) goto err; } - BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; - if (dir->first_this_inode && dir->inodes.nr) - *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); - dir->first_this_inode = false; - - if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, - "dirent in nonexisting directory:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - goto out; - } + ret = check_key_has_inode(trans, iter, dir, i, k); + if (ret) + goto err; if (!i) goto out; - if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), - c, dirent_in_non_dir_inode, - "dirent in non directory inode type %s:\n%s", - bch2_d_type_str(inode_d_type(&i->inode)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + dir->first_this_inode = false; - ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -1704,64 +2361,20 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_dirent) goto out; - d = bkey_s_c_to_dirent(k); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { - ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + ret = 
get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) goto err; if (fsck_err_on(!target->inodes.nr, - c, dirent_to_missing_inode, - "dirent points to missing inode: (equiv %u)\n%s", - equiv.snapshot, + trans, dirent_to_missing_inode, + "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1771,17 +2384,45 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, - &i->inode, i->snapshot); + ret = check_dirent_target(trans, iter, d, &i->inode); if (ret) goto err; } + + darray_for_each(target->deletes, i) + if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), + trans, dirent_to_overwritten_inode, + "dirent points to inode overwritten in snapshot %u:\n%s", + *i, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct btree_iter delete_iter; + bch2_trans_iter_init(trans, &delete_iter, + BTREE_ID_dirents, + SPOS(k.k->p.inode, k.k->p.offset, *i), + BTREE_ITER_intent); + ret = bch2_btree_iter_traverse(&delete_iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, + &delete_iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &delete_iter); + if (ret) + goto err; + + } } - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { + if (d.v->d_type == DT_DIR) + i->count++; + i->i_size += bkey_bytes(d.k); + } out: err: fsck_err: @@ -1804,13 +2445,11 @@ int bch2_check_dirents(struct bch_fs *c) snapshots_seen_init(&s); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: + check_subdir_count_notnested(trans, &dir)); snapshots_seen_exit(&s); inode_walker_exit(&dir); @@ -1828,29 +2467,29 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; int ret; - ret = check_key_has_snapshot(trans, iter, k); - if (ret) + ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret < 0) return ret; + if (ret) + return 0; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; - if (inode->first_this_inode && inode->inodes.nr) - *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); - inode->first_this_inode = false; - - if (fsck_err_on(!i, c, xattr_in_missing_inode, - "xattr for missing inode %llu", - k.k->p.inode)) - return bch2_btree_delete_at(trans, iter, 0); + ret = check_key_has_inode(trans, iter, inode, i, k); + if (ret) + return ret; if (!i) return 0; - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); -fsck_err: + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + inode->first_this_inode = false; + + ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -1867,11 +2506,13 @@ int bch2_check_xattrs(struct 
bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); + + inode_walker_exit(&inode); bch_err_fn(c, ret); return ret; } @@ -1888,38 +2529,44 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, "root subvol missing")) { - struct bkey_i_subvolume root_subvol; + struct bkey_i_subvolume *root_subvol = + bch2_trans_kmalloc(trans, sizeof(*root_subvol)); + ret = PTR_ERR_OR_ZERO(root_subvol); + if (ret) + goto err; snapshot = U32_MAX; inum = BCACHEFS_ROOT_INO; - bkey_subvolume_init(&root_subvol.k_i); - root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol.v.flags = 0; - root_subvol.v.snapshot = cpu_to_le32(snapshot); - root_subvol.v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); + bkey_subvolume_init(&root_subvol->k_i); + root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol->v.flags = 0; + root_subvol->v.snapshot = cpu_to_le32(snapshot); + root_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; } - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_dir_missing, + if (mustfix_fsck_err_on(ret, + trans, root_dir_missing, "root directory missing") || mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - c, root_inode_not_dir, + trans, root_inode_not_dir, "root inode not a directory")) { bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode.bi_inum = inum; + root_inode.bi_snapshot = snapshot; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode); bch_err_msg(c, ret, "writing root inode"); } err: @@ -1930,12 +2577,91 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; } +typedef DARRAY(u32) darray_u32; + +static bool darray_u32_has(darray_u32 *d, u32 v) +{ + darray_for_each(*d, i) + if (*i == v) + return true; + return false; +} + +static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter parent_iter = {}; + darray_u32 subvol_path = {}; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) + goto err; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + struct bch_inode_unpacked subvol_root; + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &subvol_root); + if (ret) + break; + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + + if 
(darray_u32_has(&subvol_path, parent)) { + if (fsck_err(c, subvol_loop, "subvolume loop")) + ret = reattach_subvol(trans, s); + break; + } + + bch2_trans_iter_exit(trans, &parent_iter); + bch2_trans_iter_init(trans, &parent_iter, + BTREE_ID_subvolumes, POS(0, parent), 0); + k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + trans, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + } +fsck_err: +err: + printbuf_exit(&buf); + darray_exit(&subvol_path); + bch2_trans_iter_exit(trans, &parent_iter); + return ret; +} + +int bch2_check_subvolume_structure(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_path(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + struct pathbuf_entry { u64 inum; u32 snapshot; @@ -1943,6 +2669,48 @@ struct pathbuf_entry { typedef DARRAY(struct pathbuf_entry) pathbuf; +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, + u32 new_depth) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, p->inum, p->snapshot), 0); + + struct bch_inode_unpacked inode; + int ret = bkey_err(k) ?: + !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (inode.bi_depth != new_depth) { + inode.bi_depth = new_depth; + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + darray_for_each_reverse(*path, i) { + ret = nested_lockrestart_do(trans, + bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch_err_fn(trans->c, ret); + if (ret) + break; + + new_bi_depth++; + } + + return ret ?: trans_was_restarted(trans, restart_count); +} + static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { darray_for_each(*p, i) @@ -1952,152 +2720,134 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, pathbuf *p, - u64 inum, u32 snapshot) -{ - int ret = darray_push(p, ((struct pathbuf_entry) { - .inum = inum, - .snapshot = snapshot, - })); - - if (ret) - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - p->size); - return ret; -} - -/* - * Check that a given inode is reachable from the root: - * - * XXX: we should also be verifying that inodes are in the right subvolumes - */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; + pathbuf path = {}; + struct printbuf buf = PRINTBUF; + u32 snapshot = inode_k.k->p.snapshot; + bool redo_bi_depth = false; + u32 min_bi_depth = U32_MAX; int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); - p->nr = 0; + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { 
+ while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - break; + goto out; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; - } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); - break; + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, inode_k); + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", + bch2_err_str(ret), buf.buf); + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) - break; - - ret = path_down(c, p, inode->bi_inum, snapshot); - if (ret) { - bch_err(c, "memory allocation failure"); + ret = darray_push(&path, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); + if (ret) return ret; - } snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); + + struct bch_inode_unpacked parent_inode; + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &parent_inode); if (ret) { /* Should have been caught in dirents pass */ - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up parent directory: %i", ret); - break; + bch_err_msg(c, ret, "error looking up parent directory"); + goto out; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + min_bi_depth = parent_inode.bi_depth; + + if (parent_inode.bi_depth < inode.bi_depth && + min_bi_depth < U16_MAX) + break; + + inode = parent_inode; + snapshot = inode_k.k->p.snapshot; + redo_bi_depth = true; + + if (path_is_dup(&path, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); - darray_for_each(*p, i) + darray_for_each(path, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); + pr_err("%llu:%u", inode.bi_inum, snapshot); - if (!fsck_err(c, dir_loop, "directory structure loop")) - return 0; - - ret = remove_backpointer(trans, inode); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (fsck_err(trans, dir_loop, "directory structure loop")) { + ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); - if (ret) - break; + if (ret) + break; - ret = reattach_inode(trans, inode, snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); - break; + ret = reattach_inode(trans, &inode); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } + + goto out; } } + + if (inode.bi_subvol) + min_bi_depth = 0; + + if (redo_bi_depth) + ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + darray_exit(&path); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } /* - * Check for unreachable inodes, as well as loops in the directory structure: - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's - * unreachable: + * Check for loops in the directory structure: all other connectivity issues + * have been fixed by prior passes */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; - pathbuf path = { 0, }; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!bkey_is_inode(k.k)) + if (!S_ISDIR(bkey_inode_mode(k))) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path_loop(trans, k); }))); - darray_exit(&path); bch_err_fn(c, ret); return ret; @@ -2187,15 +2937,17 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ if (!bkey_is_inode(k.k)) continue; /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(k, &u)); + _ret3 = bch2_inode_unpack(k, &u); + if (_ret3) + break; /* * Backpointer and directory structure checks are sufficient for @@ -2204,6 +2956,10 
@@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (S_ISDIR(u.bi_mode)) continue; + /* + * Previous passes ensured that bi_nlink is nonzero if + * it had multiple hardlinks: + */ if (!u.bi_nlink) continue; @@ -2230,9 +2986,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); if (ret) break; @@ -2243,8 +2999,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links if (d.v->d_type != DT_DIR && d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); + le64_to_cpu(d.v->d_inum), d.k->p.snapshot); } 0; }))); @@ -2260,7 +3015,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite struct nlink_table *links, size_t *idx, u64 range_end) { - struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct nlink *link = &links->d[*idx]; int ret = 0; @@ -2271,7 +3025,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; if (S_ISDIR(u.bi_mode)) return 0; @@ -2286,12 +3042,12 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite } if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - c, inode_wrong_nlink, + trans, inode_wrong_nlink, "inode %llu type %s has wrong i_nlink (%u, should be %u)", u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u); } fsck_err: return ret; @@ -2307,7 +3063,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { @@ -2375,7 +3131,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, u->v.front_pad = 0; u->v.back_pad = 0; - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); } int bch2_fix_reflink_p(struct bch_fs *c) @@ -2386,10 +3142,230 @@ int bch2_fix_reflink_p(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); return ret; } + +#ifndef NO_BCACHEFS_CHARDEV + +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + kfree(thr); +} + +static int 
bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; + + ret = bch2_fs_start(thr->c); + if (ret) + goto err; + + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + ret |= 4; + } +err: + bch2_fs_stop(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + darray_str(devs) = {}; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; + + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; + + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 0); + + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + + ret = __bch2_run_thread_with_stdio(&thr->thr); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); + return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; +} + +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? 
+ */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + +long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} + +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index da991e8cf27e..574948278cd4 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,14 +2,27 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H +#include "str_hash.h" + +int bch2_fsck_update_backpointers(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct bkey_i *); + int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); +int bch2_check_subvolume_structure(struct bch_fs *); +int bch2_check_unreachable_inodes(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); +long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); + #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 086f0090b03a..339b80770f1d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -8,10 +8,13 @@ #include "buckets.h" #include "compress.h" #include "dirent.h" +#include "disk_accounting.h" #include "error.h" #include "extents.h" #include "extent_update.h" +#include "fs.h" #include "inode.h" +#include "opts.h" #include "str_hash.h" #include "snapshot.h" #include "subvolume.h" @@ -19,7 +22,7 @@ #include <linux/random.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define x(name, ...) 
#name, const char * const bch2_inode_opts[] = { @@ -33,6 +36,8 @@ static const char * const bch2_inode_flag_strs[] = { }; #undef x +static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -43,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, u8 *p; if (in >= end) - return -1; + return -BCH_ERR_inode_unpack_error; if (!*in) - return -1; + return -BCH_ERR_inode_unpack_error; /* * position of highest set bit indicates number of bytes: @@ -56,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, bytes = byte_table[shift - 1]; if (in + bytes > end) - return -1; + return -BCH_ERR_inode_unpack_error; p = (u8 *) be + 16 - bytes; memcpy(p, in, bytes); @@ -159,8 +164,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, unsigned fieldnr = 0, field_bits; int ret; -#define x(_name, _bits) \ - if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ +#define x(_name, _bits) \ + if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ memset((void *) unpacked + offset, 0, \ sizeof(*unpacked) - offset); \ @@ -172,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return ret; \ \ if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ \ unpacked->_name = field[1]; \ in += ret; @@ -213,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v2() @@ -264,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v3() @@ -279,6 +284,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); + unpacked->bi_snapshot = k.k->p.snapshot; + switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -289,10 +296,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - if (INODE_NEW_VARINT(inode.v)) { + if (INODEv1_NEW_VARINT(inode.v)) { return bch2_inode_unpack_v2(unpacked, inode.v->fields, bkey_val_end(inode), - INODE_NR_FIELDS(inode.v)); + INODEv1_NR_FIELDS(inode.v)); } else { return bch2_inode_unpack_v1(inode, unpacked); } @@ -319,27 +326,27 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - if (likely(k.k->type == KEY_TYPE_inode_v3)) - return bch2_inode_unpack_v3(k, unpacked); - return bch2_inode_unpack_slowpath(k, unpacked); + unpacked->bi_snapshot = k.k->p.snapshot; + + return likely(k.k->type == KEY_TYPE_inode_v3) + ? 
bch2_inode_unpack_v3(k, unpacked) + : bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) +int __bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags, + bool warn) { - struct bkey_s_c k; u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); if (ret) return ret; - k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -354,24 +361,16 @@ static int bch2_inode_peek_nowarn(struct btree_trans *trans, return 0; err: + if (warn) + bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); bch2_trans_iter_exit(trans, iter); return ret; } -int bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); - return ret; -} - int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_inode_buf *inode_p; @@ -384,6 +383,30 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = inode->bi_snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_internal_snapshot_node); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; @@ -405,100 +428,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bch_inode_unpacked unpacked; int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, - inode_pos_blockdev_range, + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, + c, inode_pos_blockdev_range, "fs inode in blockdev range"); - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, - inode_unpack_error, + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), + c, inode_unpack_error, 
"invalid variable length fields"); - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, - inode_checksum_type_invalid, + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, + c, inode_checksum_type_invalid, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, - inode_compression_type_invalid, + !bch2_compression_opt_valid(unpacked.bi_compression - 1), + c, inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, c, err, - inode_unlinked_but_nlink_nonzero, + unpacked.bi_nlink != 0, + c, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, - inode_subvol_root_but_not_dir, + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), + c, inode_subvol_root_but_not_dir, "subvolume root but not a directory"); fsck_err: return ret; } -int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", - INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } -int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } -int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, - inode_v3_fields_start_bad, + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), + c, inode_v3_fields_start_bad, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, 
k, err); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } @@ -506,38 +527,35 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { + prt_printf(out, "\n"); printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o", inode->bi_mode); - prt_newline(out); + prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)", inode->bi_flags); - prt_newline(out); + prt_printf(out, "(%x)\n", inode->bi_flags); - prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); + prt_printf(out, "hash_type="); + bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); prt_newline(out); - - prt_printf(out, "bi_size=%llu", inode->bi_size); - prt_newline(out); - - prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); - prt_newline(out); - - prt_newline(out); - prt_printf(out, "bi_version=%llu", inode->bi_version); + prt_printf(out, "bi_size=%llu\n", inode->bi_size); + prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); + prt_printf(out, "bi_version=%llu\n", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, #_name "=%llu", (u64) inode->_name); \ - prt_newline(out); + prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x + + bch2_printbuf_strip_trailing_newline(out); printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "inum: %llu ", inode->bi_inum); + prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot); __bch2_inode_unpacked_to_text(out, inode); } @@ -567,62 +585,207 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) } } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); + return; + case KEY_TYPE_inode_v2: + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); + return; + case KEY_TYPE_inode_v3: + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); + return; + default: + BUG(); + } +} + +static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) +{ + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); +} + +static struct bkey_s_c +bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + + bch2_trans_iter_exit(trans, iter); + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; +} + +static struct bkey_s_c +bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos, unsigned flags) +{ + struct bkey_s_c k; +again: + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); + if (!k.k || + bkey_err(k) || + bkey_is_inode(k.k)) + return k; + + bch2_trans_iter_exit(trans, iter); + pos = k.k->p; + goto again; +} + +int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_with_updates, k, ret) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && + bkey_is_inode(k.k)) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int update_inode_has_children(struct btree_trans *trans, + struct bkey_s k, + bool have_child) +{ + if (!have_child) { + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) + return ret < 0 ? ret : 0; + } + + u64 f = bkey_inode_flags(k.s_c); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); + + return 0; +} + +static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, + bool have_child) { - return bkey_inode_flags(k) & BCH_INODE_unlinked; + struct btree_iter iter; + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, + &iter, pos, BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (!have_child) { + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto err; + } + } + + u64 f = bkey_inode_flags(k); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } int bch2_trigger_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { + BUG_ON(!trans->journal_res.seq); + bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); + } + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; + int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); + if (flags & BTREE_TRIGGER_transactional) { + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - + (int) bkey_is_unlinked_inode(old); + if (unlinked_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, unlinked_delta > 0); if (ret) return ret; - - trans->fs_usage_deltas->nr_inodes += nr; } - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(new.s_c); - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + /* + * If we're creating or deleting an inode at this snapshot ID, + * and there might be an inode in a parent snapshot ID, we might + * need to set or clear the has_child_snapshot flag on the + * parent. 
+ */ + int deleted_delta = (int) bkey_is_inode(new.k) - + (int) bkey_is_inode(old.k); + if (deleted_delta && + bch2_snapshot_parent(c, new.k->p.snapshot)) { + int ret = update_parent_inode_has_children(trans, new.k->p, + deleted_delta > 0); if (ret) return ret; } - } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { - BUG_ON(!trans->journal_res.seq); - - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); - } - - if (flags & BTREE_TRIGGER_GC) { - struct bch_fs *c = trans->c; - - percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->b.nr_inodes, nr); - percpu_up_read(&c->mark_lock); + /* + * When an inode is first updated in a new snapshot, we may need + * to clear has_child_snapshot + */ + if (deleted_delta > 0) { + int ret = update_inode_has_children(trans, new, false); + if (ret) + return ret; + } } return 0; } -int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); fsck_err: return ret; @@ -636,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -644,10 +829,8 @@ void bch2_inode_init_early(struct bch_fs *c, memset(inode_u, 0, sizeof(*inode_u)); - /* ick */ - inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, - sizeof(inode_u->bi_hash_seed)); + SET_INODE_STR_HASH(inode_u, str_hash); + get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, @@ -698,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + u64 cursor_idx = c->opts.inodes_32bit ? 
0 : cpu + 1; + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; + + if (c->opts.inodes_32bit) { + *min = BLOCKDEV_INODE_MAX; + *max = INT_MAX; } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = 63 - c->opts.shard_inode_numbers_bits; + + *min = max(cpu << bits, (u64) INT_MAX + 1); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); } - start = READ_ONCE(*hint); + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); + + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : cursor; +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; - if (start >= max || start < min) - start = min; + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -764,6 +982,7 @@ again: /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -774,9 +993,9 @@ found_slot: return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -795,7 +1014,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); while (1) { bch2_trans_begin(trans); @@ -806,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) goto err; @@ -817,7 +1036,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -838,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { 
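The bch2_inode_alloc_cursor_get() hunk above replaces the in-memory per-CPU inode hints with a persistent cursor key in the logged_ops btree: the cursor index is clamped to a per-shard [min, max) range and, when the shard is exhausted, wrapped back to min with a generation bump. A minimal sketch of that clamp-and-wrap step with plain integers (struct inode_cursor and cursor_advance() are illustrative names, not from the patch):

#include <stdint.h>

/* Sketch only: mirrors the clamp/wraparound behaviour of bch2_inode_alloc_cursor_get(). */
struct inode_cursor {
        uint64_t idx;   /* next inode number to try */
        uint32_t gen;   /* bumped whenever the shard wraps */
};

static uint64_t cursor_advance(struct inode_cursor *c, uint64_t min, uint64_t max)
{
        if (c->idx < min)
                c->idx = min;

        if (c->idx >= max) {            /* shard exhausted: wrap and bump generation */
                c->idx = min;
                c->gen++;
        }

        return c->idx++;
}

In the patch itself the generation lives in the on-disk cursor key and is what feeds bi_generation when bch2_inode_create() hands out a slot.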
struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -866,7 +1083,7 @@ retry: k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); + BTREE_ITER_intent|BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto err; @@ -879,13 +1096,7 @@ retry: goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: @@ -893,6 +1104,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (ret) + goto err2; + + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +err2: bch2_trans_put(trans); return ret; } @@ -926,8 +1142,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(trans, inum, inode)); + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -977,12 +1192,18 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, struct bch_inode_unpacked *inode) { -#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); +#define x(_name, _bits) \ + if ((inode)->bi_##_name) { \ + opts->_name = inode->bi_##_name - 1; \ + opts->_name##_from_inode = true; \ + } else { \ + opts->_name = c->opts._name; \ + opts->_name##_from_inode = false; \ + } BCH_INODE_OPTS() #undef x - if (opts->nocow) - opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; + bch2_io_opts_fixups(opts); } int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) @@ -997,7 +1218,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; @@ -1026,7 +1247,7 @@ retry: bch2_trans_begin(trans); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + SPOS(0, inum, snapshot), BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1060,6 +1281,45 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } +/* + * After deleting an inode, there may be versions in older snapshots that should + * also be deleted - if they're not referenced by sibling snapshots and not open + * in other subvolumes: + */ +static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; +next_parent: + ret = lockrestart_do(trans, + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); + if (ret || !k.k) + return ret; + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; + bch2_trans_iter_exit(trans, &iter); + + if (!unlinked) + return 
0; + + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? ret : 0; + + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); + if (ret) + return ret; + goto next_parent; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); +} + static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos, @@ -1069,16 +1329,17 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; int ret; - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), c, - deleted_inode_missing, + if (fsck_err_on(!bkey_is_inode(k.k), + trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1088,8 +1349,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1097,41 +1359,42 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, - deleted_inode_not_unlinked, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; - if (c->sb.clean && - !fsck_err(c, - deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } - - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { - struct bpos new_min_pos; + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); - if (ret) - goto out; + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto out; - inode.bi_flags &= ~BCH_INODE_unlinked; + if (ret) { + if (fsck_err(trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) { + inode.bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto out; + } + goto delete; - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch_err_msg(c, ret, "clearing inode unlinked flag"); - if (ret) - goto out; + } - /* - * We'll need another write buffer flush to pick up the new - * unlinked inodes in the snapshot leaves: - */ - *need_another_pass = true; 
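The new delete_ancestor_snapshot_inodes() above climbs from a deleted inode through its parent snapshots, removing unlinked ancestor versions that are not open in another subvolume. A compilable outline of that walk, with the patch's helpers replaced by hypothetical stand-ins noted in the comments:

#include <stdbool.h>

struct ipos { unsigned long long inum; unsigned snapshot; };

/* Hypothetical stand-ins for the patch's lookups/helpers: */
bool lookup_parent_version(struct ipos *pos);   /* bch2_inode_get_iter_snapshot_parent() */
bool version_is_unlinked(struct ipos pos);      /* bkey_is_unlinked_inode() */
bool version_is_open(struct ipos pos);          /* bch2_inode_or_descendents_is_open() */
int  delete_version(struct ipos pos);           /* __bch2_inode_rm_snapshot() */

static int delete_ancestor_versions(struct ipos pos)
{
        /* Climb parent snapshots until we hit a version that must be kept. */
        while (lookup_parent_version(&pos)) {
                if (!version_is_unlinked(pos) || version_is_open(pos))
                        return 0;

                int ret = delete_version(pos);
                if (ret)
                        return ret;
        }
        return 0;
}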
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) && + !fsck_err(trans, deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; goto out; } @@ -1139,9 +1402,10 @@ static int may_delete_deleted_inode(struct btree_trans *trans, out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } @@ -1151,6 +1415,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bool need_another_pass; int ret; again: + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * alway need a write buffer flush + */ + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + need_another_pass = false; /* @@ -1160,11 +1433,12 @@ again: * flushed and we'd spin: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { - bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); /* @@ -1183,12 +1457,8 @@ again: ret; })); - if (!ret && need_another_pass) { - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; + if (!ret && need_another_pass) goto again; - } err: bch2_trans_put(trans); return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b63f312581cf..428b9be6af34 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,37 +5,47 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" +#include "snapshot.h" -enum bkey_invalid_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); + +static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 + ? 
__bch2_inode_has_child_snapshots(trans, pos) + : 0; +} + int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_invalid = bch2_inode_invalid, \ + .key_validate = bch2_inode_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v2_invalid, \ + .key_validate = bch2_inode_v2_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v3_invalid, \ + .key_validate = bch2_inode_v3_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ @@ -48,16 +58,26 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_invalid = bch2_inode_generation_invalid, \ + .key_validate = bch2_inode_generation_validate, \ .val_to_text = bch2_inode_generation_to_text, \ .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; @@ -68,6 +88,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u32 bi_snapshot; u64 bi_journal_seq; __le64 bi_hash_seed; u64 bi_size; @@ -80,6 +101,7 @@ struct bch_inode_unpacked { BCH_INODE_FIELDS_v3() #undef x }; +BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); struct bkey_inode_buf { struct bkey_i_inode_v3 inode; @@ -87,7 +109,7 @@ struct bkey_inode_buf { #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS_v3()]; #undef x -} __packed __aligned(8); +}; void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); @@ -95,11 +117,29 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); -int bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, subvol_inum, unsigned); +int __bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned, bool); + +static inline int bch2_inode_peek_nowarn(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + return __bch2_inode_peek(trans, iter, inode, inum, flags, false); +} + +static inline int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + return __bch2_inode_peek(trans, 
iter, inode, inum, flags, true); + int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); + return ret; +} int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_update_flags); + struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); static inline int bch2_inode_write(struct btree_trans *trans, struct btree_iter *iter, @@ -108,6 +148,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, @@ -172,6 +215,34 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + +static inline unsigned bkey_inode_mode(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); + case KEY_TYPE_inode_v2: + return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); + case KEY_TYPE_inode_v3: + return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) @@ -201,11 +272,29 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) +{ + bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; + + return S_ISDIR(inode->bi_mode) || + (!inode->bi_nlink && inode_has_bp); +} + struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +#include "rebalance.h" + +static inline struct bch_extent_rebalance +bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts io_opts; + bch2_inode_opts_get(&io_opts, c, inode); + return io_opts_to_rebalance_opts(c, &io_opts); +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 83d107331edf..b99a5bf1a75e 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -101,7 +101,9 @@ struct bch_inode_generation { x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) + x(bi_nocow, 8) \ + x(bi_depth, 32) \ + x(bi_inodes_32bit, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -114,7 +116,8 @@ struct bch_inode_generation { x(foreground_target, 16) \ x(background_target, 16) \ x(erasure_code, 16) \ - x(nocow, 8) + x(nocow, 8) \ + x(inodes_32bit, 8) enum inode_opt_id { #define x(name, ...) 
\ @@ -133,7 +136,8 @@ enum inode_opt_id { x(i_size_dirty, 5) \ x(i_sectors_dirty, 6) \ x(unlinked, 7) \ - x(backptr_untrusted, 8) + x(backptr_untrusted, 8) \ + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ @@ -149,9 +153,9 @@ enum __bch_inode_flags { #undef x }; -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32); LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); @@ -163,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 1baf78594cca..5353979117b0 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) - bch_err_inum_offset_ratelimited(c, - inum.inum, - iter->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (should_print_err(ret)) { + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); @@ -125,8 +127,8 @@ err_noprint: bch2_bkey_buf_exit(&old, c); if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); + bch2_trans_unlock_long(trans); + bch2_wait_on_allocator(c, &cl); } return ret; @@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); /* - * peek_upto() doesn't have ideal semantics for extents: + * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_upto(iter, end_pos); + k = bch2_btree_iter_peek_max(iter, end_pos); if (!k.k) break; @@ -198,7 +200,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, start), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); @@ -224,13 +226,14 @@ void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, str static int truncate_set_isize(struct btree_trans *trans, subvol_inum inum, - u64 new_i_size) + u64 new_i_size, + bool warn) { struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?: (inode_u.bi_size = new_i_size, 0) ?: bch2_inode_write(trans, &iter, &inode_u); @@ -247,23 +250,25 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i_logged_op_truncate *op = 
bkey_i_to_logged_op_truncate(op_k); subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; u64 new_i_size = le64_to_cpu(op->v.new_i_size); + bool warn_errors = i_sectors_delta != NULL; int ret; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - truncate_set_isize(trans, inum, new_i_size)); + truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL)); if (ret) goto err; bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); bch2_trans_iter_exit(trans, &fpunch_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; err: - bch2_logged_op_finish(trans, op_k); + if (warn_errors) + bch_err_fn(c, ret); return ret; } @@ -287,9 +292,14 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec * resume only proceeding in one of the snapshots */ down_read(&c->snapshot_create_lock); - int ret = bch2_trans_run(c, - bch2_logged_op_start(trans, &op.k_i) ?: - __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); + struct btree_trans *trans = bch2_trans_get(c); + int ret = bch2_logged_op_start(trans, &op.k_i); + if (ret) + goto out; + ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); + ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; +out: + bch2_trans_put(trans); up_read(&c->snapshot_create_lock); return ret; @@ -307,7 +317,8 @@ void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, stru prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); } -static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) +static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, + u64 offset, s64 len, bool warn) { struct btree_iter iter; struct bch_inode_unpacked inode_u; @@ -316,7 +327,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset offset <<= 9; len <<= 9; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn); if (ret) return ret; @@ -356,15 +367,25 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, u64 len = abs(shift); u64 pos = le64_to_cpu(op->v.pos); bool insert = shift > 0; + u32 snapshot; + bool warn_errors = i_sectors_delta != NULL; int ret = 0; ret = bch2_inum_opts_get(trans, inum, &opts); if (ret) return ret; + /* + * check for missing subvolume before fpunch, as in resume we don't want + * it to be a fatal error + */ + ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); + if (ret) + return ret; + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); switch (op->v.state) { case LOGGED_OP_FINSERT_start: @@ -372,7 +393,7 @@ case LOGGED_OP_FINSERT_start: if (insert) { ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, len) ?: + adjust_i_size(trans, inum, src_offset, len, warn_errors) ?: bch2_logged_op_update(trans, &op->k_i)); if (ret) goto err; @@ -395,11 +416,11 @@ case LOGGED_OP_FINSERT_shift_extents: struct bkey_i delete, *copy; struct bkey_s_c k; struct bpos src_pos = POS(inum.inum, src_offset); - u32 snapshot; bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + ret 
= __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, + warn_errors); if (ret) goto btree_err; @@ -407,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents: bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev(&iter) - : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; @@ -442,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: @@ -462,12 +483,12 @@ btree_err: if (!insert) { ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, shift) ?: + adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?: bch2_logged_op_update(trans, &op->k_i)); } else { /* We need an inode update to update bi_journal_seq for fsync: */ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, 0, 0) ?: + adjust_i_size(trans, inum, 0, 0, warn_errors) ?: bch2_logged_op_update(trans, &op->k_i)); } @@ -476,8 +497,9 @@ case LOGGED_OP_FINSERT_finish: break; } err: - bch2_logged_op_finish(trans, op_k); bch2_trans_iter_exit(trans, &iter); + if (warn_errors) + bch_err_fn(c, ret); return ret; } @@ -506,9 +528,14 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, * resume only proceeding in one of the snapshots */ down_read(&c->snapshot_create_lock); - int ret = bch2_trans_run(c, - bch2_logged_op_start(trans, &op.k_i) ?: - __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); + struct btree_trans *trans = bch2_trans_get(c); + int ret = bch2_logged_op_start(trans, &op.k_i); + if (ret) + goto out; + ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); + ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; +out: + bch2_trans_put(trans); up_read(&c->snapshot_create_lock); return ret; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c574d8873a1..aa91fcf51eec 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -21,6 +21,7 @@ #include "io_read.h" #include "io_misc.h" #include "io_write.h" +#include "reflink.h" #include "subvolume.h" #include "trace.h" @@ -58,7 +59,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) } rcu_read_unlock(); - return bch2_rand_range(nr * CONGESTED_MAX) < total; + return get_random_u32_below(nr * CONGESTED_MAX) < total; } #else @@ -84,29 +85,38 @@ struct promote_op { }; static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), + .automatic_shrinking = true, }; +static inline bool have_io_error(struct bch_io_failures *failed) +{ + return failed && failed->nr; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, - unsigned flags) + unsigned flags, + struct bch_io_failures *failed) { - BUG_ON(!opts.promote_target); + 
if (!have_io_error(failed)) { + BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) - return -BCH_ERR_nopromote_may_not; + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; - if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) @@ -163,7 +173,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned sectors, - struct bch_read_bio **rbio) + struct bch_read_bio **rbio, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; struct promote_op *op = NULL; @@ -174,7 +185,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err; @@ -216,14 +227,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + struct data_update_opts update_opts = {}; + + if (!have_io_error(failed)) { + update_opts.target = opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; + } else { + update_opts.target = opts.foreground_target; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + } + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, + update_opts, btree_id, k); /* * possible errors: -BCH_ERR_nocow_lock_blocked, @@ -243,7 +268,8 @@ err: bio_free_pages(&(*rbio)->bio); kfree(*rbio); *rbio = NULL; - kfree(op); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -257,10 +283,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, unsigned flags, struct bch_read_bio **rbio, bool *bounce, - bool *read_full) + bool *read_full, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* + * if failed != NULL we're not actually doing a promote, we're + * recovering from an io/checksum error + */ + bool promote_full = (have_io_error(failed) || + *read_full || + READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? 
max(pick->crc.compressed_size, pick->crc.live_size) @@ -271,7 +304,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags); + ret = should_promote(c, k, pos, opts, flags, failed); if (ret) goto nopromote; @@ -279,7 +312,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); + k, pos, pick, opts, sectors, rbio, failed); ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -294,6 +327,20 @@ nopromote: /* Read */ +static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + return bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9); +} + +static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); +} + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -378,17 +425,17 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + rbio->read_pos, BTREE_ITER_slots); retry: + bch2_trans_begin(trans); rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) goto err; bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, @@ -472,6 +519,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } +static void bch2_read_io_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bio *bio = &rbio->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); + + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); +} + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -487,11 +557,11 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out; - if (bversion_cmp(k.k->version, rbio->version) || + if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -523,7 +593,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -531,8 +601,75 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); + bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio)); +} + +static void bch2_read_csum_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bio *src = &rbio->bio; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decompress_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decompression error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decrypt_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decrypt error"); + + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); } /* Inner part that may run in process context */ @@ -541,7 +678,6 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -642,31 +778,13 @@ csum_err: goto out; } - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); - printbuf_exit(&buf); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; } @@ -675,7 +793,7 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; @@ -687,17 +805,13 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + if (unlikely(bio->bi_status)) { + bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); return; } if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -718,72 +832,45 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -EIO; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c k, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); + PTR_BUCKET_POS(ca, &ptr), + BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); + int gen = bucket_gen_get(ca, iter.pos.offset); + if (gen >= 0) { + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); + printbuf_indent_add(&buf, 2); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + prt_printf(&buf, "memory gen: %u", gen); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + } else { + prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", + iter.pos.inode, iter.pos.offset); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "first bucket %u nbuckets %llu\n", + ca->mi.first_bucket, ca->mi.nbuckets); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); } 
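The io_read.c hunks above stop reporting checksum, decompression and decrypt errors directly from the bio completion handler and instead punt them to work items (bch2_read_csum_err() and friends) via bch2_rbio_punt(), so the message formatting happens in process context. A generic sketch of that defer-to-workqueue shape, with simplified stand-in types (struct my_rbio and the my_* functions are illustrative, not the patch's):

#include <linux/bio.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

struct my_rbio {
        struct work_struct      work;
        u64                     pos;
        struct bio              bio;    /* embedded bio, kept last as in bch_read_bio */
};

static void my_read_err_work(struct work_struct *work)
{
        struct my_rbio *rbio = container_of(work, struct my_rbio, work);

        /* Process context: safe to allocate, take locks, build printbufs, etc. */
        pr_err_ratelimited("read error at sector %llu\n", rbio->pos);
}

static void my_read_endio(struct bio *bio)
{
        struct my_rbio *rbio = container_of(bio, struct my_rbio, bio);

        if (bio->bi_status) {
                INIT_WORK(&rbio->work, my_read_err_work);
                queue_work(system_unbound_wq, &rbio->work);
                return;
        }
        /* ... normal completion path ... */
}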
bch2_fs_inconsistent(c, "%s", buf.buf); @@ -801,7 +888,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -825,14 +911,29 @@ retry_pick: if (!pick_ret) goto hole; - if (pick_ret < 0) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from"); + if (unlikely(pick_ret < 0)) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); goto err; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -842,25 +943,24 @@ retry_pick: */ if ((flags & BCH_READ_IN_RETRY) && !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick); + percpu_ref_put(&ca->io_ref); goto retry_pick; } - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); - if (flags & BCH_READ_NODECODE) { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (ca) + percpu_ref_put(&ca->io_ref); goto hole; + } iter.bi_size = pick.crc.compressed_size << 9; goto get_bio; @@ -888,9 +988,9 @@ retry_pick: bounce = true; } - if (orig->opts.promote_target) + if (orig->opts.promote_target || have_io_error(failed)) promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + &rbio, &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -965,7 +1065,7 @@ get_bio: rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; rbio->retry = 0; @@ -977,10 +1077,13 @@ get_bio: rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; - rbio->version = k.k->version; + rbio->version = k.k->bversion; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); + if (flags & BCH_READ_NODECODE) + orig->pick = pick; + rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; @@ -995,7 +1098,7 @@ get_bio: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (pick.ptr.cached && !(flags & 
BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1004,12 +1107,25 @@ get_bio: trace_and_count(c, read_split, &orig->bio); } + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + if (!(flags & BCH_READ_IN_RETRY)) + bch2_trans_unlock(trans); + else + bch2_trans_unlock_long(trans); + if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); + if (unlikely(!rbio->have_ioref)) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); + prt_printf(&buf, "no device to read from:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1035,7 +1151,7 @@ get_bio: trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio)) { + if (bch2_ec_read_extent(trans, rbio, k)) { bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1047,6 +1163,8 @@ out: if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { + bch2_trans_unlock(trans); + int ret; rbio->context = RBIO_CONTEXT_UNBOUND; @@ -1097,34 +1215,26 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; - u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + POS(inum.inum, bvec_iter.bi_sector), + BTREE_ITER_slots); + while (1) { - unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, bvec_iter.bi_sector)); @@ -1132,18 +1242,18 @@ retry: k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; - offset_into_extent = iter.pos.offset - + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); @@ -1151,9 +1261,9 @@ retry: * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) @@ -1163,36 +1273,36 @@ retry: data_btree, k, offset_into_extent, 
failed, flags); if (ret) - break; + goto err; if (flags & BCH_READ_LAST_FRAGMENT) break; swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + ret != READ_RETRY && + ret != READ_RETRY_AVOID) break; } -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); + bch2_trans_iter_exit(trans, &iter); if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index d9c18bb7d403..a82e8a94ccb6 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "reflink.h" struct bch_read_bio { struct bch_fs *c; @@ -79,19 +80,32 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) + s64 *offset_into_extent, + struct bkey_buf *extent) { - if (k->k->k.type != KEY_TYPE_reflink_p) + if (extent->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, + offset_into_extent, + bkey_i_to_s_c_reflink_p(extent->k), + true, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_missing_indirect_extent; + } + + bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; } enum bch_read_flags { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 2c098ac017b3..03892388832b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) u64 io_latency = time_after64(now, submit_time) ? 
now - submit_time : 0; - u64 old, new, v = atomic64_read(latency); + u64 old, new; + old = atomic64_read(latency); do { - old = v; - /* * If the io latency was reasonably close to the current * latency, skip doing the update and atomic operation - most of @@ -84,11 +83,11 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) break; new = ewma_add(old, io_latency, 5); - } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + } while (!atomic64_try_cmpxchg(latency, &old, new)); bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif @@ -165,8 +164,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_max_continue_norestart(iter, + new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -199,9 +198,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, u64 new_i_size, s64 i_sectors_delta) { - struct btree_iter iter; - struct bkey_i *k; - struct bkey_i_inode_v3 *inode; /* * Crazy performance optimization: * Every extent update needs to also update the inode: the inode trigger @@ -213,26 +209,38 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, * to be journalled - if we crash, the bi_journal_seq update will be * lost, but that's fine. */ - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; - int ret; + unsigned inode_update_flags = BTREE_UPDATE_nojournal; - k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), - BTREE_ITER_CACHED); - ret = PTR_ERR_OR_ZERO(k); + BTREE_ITER_intent| + BTREE_ITER_cached); + int ret = bkey_err(k); if (unlikely(ret)) return ret; - if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { - k = bch2_inode_to_v3(trans, k); - ret = PTR_ERR_OR_ZERO(k); + /* + * varint_decode_fast(), in the inode .invalid method, reads up to 7 + * bytes past the end of the buffer: + */ + struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); + ret = PTR_ERR_OR_ZERO(k_mut); + if (unlikely(ret)) + goto err; + + bkey_reassemble(k_mut, k); + + if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) { + k_mut = bch2_inode_to_v3(trans, k_mut); + ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) goto err; } - inode = bkey_i_to_inode_v3(k); + struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && new_i_size > le64_to_cpu(inode->v.bi_size)) { @@ -251,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); @@ -360,9 +368,9 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) 
?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -388,6 +396,31 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ +static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, + u64 offset) +{ + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +{ + __bch2_write_op_error(out, op, op->pos.offset); +} + +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -399,13 +432,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = nocow + ? bch2_dev_have_ref(c, ptr->dev) + : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -422,11 +454,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? 
READ : WRITE); + n->have_ioref = ca != NULL; n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); + if (nocow) + n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { @@ -473,7 +506,6 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { @@ -525,13 +557,14 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret && !bch2_err_matches(ret, EROFS)) { + if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) @@ -547,7 +580,7 @@ out: err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; goto out; } @@ -582,7 +615,7 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_DONE) && + if ((op->flags & BCH_WRITE_SUBMITTED) && (op->flags & BCH_WRITE_MOVE)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); @@ -614,9 +647,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); + op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); @@ -627,7 +658,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) + if (!(op->flags & BCH_WRITE_SUBMITTED)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -641,7 +672,9 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref + ? 
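
In the bch2_write_point_do_index_updates() hunk above, the open-coded list_first_entry_or_null() + list_del() pair becomes a single list_pop_entry() call. The helper's definition is not part of this diff; a plausible sketch of what it expands to, expressed with the standard list.h primitives it replaces:

	#define list_pop_entry(head, type, member)				\
	({									\
		type *_entry = list_first_entry_or_null(head, type, member);	\
		if (_entry)							\
			list_del(&_entry->member);				\
		_entry;								\
	})

As in the code it replaces, the call here runs under wp->writes_lock, so popping the front entry stays atomic with respect to other users of the list.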
bch2_dev_have_ref(c, wbio->dev) + : NULL; if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, op->pos.inode, @@ -652,8 +685,12 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } - if (wbio->nocow) + if (wbio->nocow) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + POS(ca->dev_idx, wbio->nocow_bucket), + BUCKET_NOCOW_LOCK_UPDATE); set_bit(wbio->dev, op->devs_need_flush->d); + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -684,7 +721,7 @@ static void init_append_extent(struct bch_write_op *op, e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; - e->k.version = version; + e->k.bversion = version; if (crc.csum_type || crc.compression_type || @@ -846,7 +883,7 @@ static enum prep_encoded_ret { if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + if (bch2_bio_uncompress_inplace(op, bio)) return PREP_ENCODED_ERR; } @@ -1067,7 +1104,14 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1091,30 +1135,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, return false; e = bkey_s_c_to_extent(k); + + rcu_read_lock(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) + if (crc_is_encoded(p.crc) || p.has_ec) { + rcu_read_unlock(); return false; + } replicas += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); return replicas >= op->opts.data_replicas; } -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *orig, @@ -1148,7 +1183,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0) ?: bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) @@ -1157,9 +1192,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct btree_trans *trans = bch2_trans_get(c); for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1167,10 +1202,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - 
bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) { @@ -1184,8 +1220,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - bch2_nocow_write_unlock(op); - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { op->error = -EIO; } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) @@ -1215,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op) DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; - int ret; + int stale, ret; if (op->flags & BCH_WRITE_MOVE) return; @@ -1231,12 +1265,16 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bio *bio = &op->wbio.bio; buckets.nr = 0; + ret = bch2_trans_relock(trans); + if (ret) + break; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1256,14 +1294,15 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bpos b = PTR_BUCKET_POS(c, ptr); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (unlikely(!ca)) + goto err_get_ioref; + + struct bpos b = PTR_BUCKET_POS(ca, ptr); struct nocow_lock_bucket *l = bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); prefetch(l); - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - /* XXX allocating memory with btree locks held - rare */ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .gen = ptr->gen, .l = l, @@ -1282,16 +1321,14 @@ retry: bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); + struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); - rcu_read_lock(); - bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); - rcu_read_unlock(); - + int gen = bucket_gen_get(ca, i->b.offset); + stale = gen < 0 ? 
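
Throughout io_write.c this patch replaces bch_err_inum_offset_ratelimited() calls with a common sequence: build the message in a printbuf, prefix it with the inum/offset and "write error" via bch2_write_op_error() (or the _trans variant when a btree_trans is held), append the specific failure, and print it rate-limited. Condensed into one illustrative helper using only calls visible in the patch (the helper itself is not part of the diff):

	/* Illustrative only: mirrors the inline pattern used at each error site */
	static void report_write_error(struct bch_fs *c, struct bch_write_op *op, int ret)
	{
		struct printbuf buf = PRINTBUF;

		bch2_write_op_error(&buf, op);		/* "inum:offset ... write error: " prefix */
		prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);			/* frees the printbuf's heap buffer */
	}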
gen : gen_after(gen, i->gen); if (unlikely(stale)) { stale_at = i; goto err_bucket_stale; @@ -1305,7 +1342,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; } op->pos.offset += bio_sectors(bio); @@ -1319,7 +1356,7 @@ retry: op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_DONE) + if (op->flags & BCH_WRITE_SUBMITTED) break; bch2_btree_iter_advance(&iter); } @@ -1329,19 +1366,21 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_trans_put(trans); + darray_exit(&buckets); + if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, op->pos.offset << 9, - "%s: btree lookup error %s", __func__, bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); op->error = ret; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_DONE)) { + if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; @@ -1359,7 +1398,7 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1370,8 +1409,18 @@ err_bucket_stale: break; } - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + struct printbuf buf = PRINTBUF; + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + } else { + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + } + printbuf_exit(&buf); + goto err_get_ioref; } @@ -1387,7 +1436,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_DONE) + if (op->flags & BCH_WRITE_SUBMITTED) goto out_nofs_restore; } again: @@ -1414,7 +1463,7 @@ again: * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), @@ -1424,9 +1473,7 @@ again: op->nr_replicas_required, op->watermark, op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? 
NULL : &op->cl, &wp)); + &op->cl, &wp))); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; @@ -1442,14 +1489,16 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_DONE; - - if (ret < 0) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + op->flags |= BCH_WRITE_SUBMITTED; + + if (unlikely(ret < 0)) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } op->error = ret; break; } @@ -1476,12 +1525,13 @@ err: * once, as that signals backpressure to the caller. */ if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_DONE) && + (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); + bch2_wait_on_allocator(c, &op->cl); + __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) + if (!(op->flags & BCH_WRITE_SUBMITTED)) goto again; bch2_write_done(&op->cl); } else { @@ -1500,8 +1550,10 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; + memset(&op->failed, 0, sizeof(op->failed)); + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_SUBMITTED; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1518,7 +1570,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) id = bkey_inline_data_init(op->insert_keys.top); id->k.p = op->pos; - id->k.version = op->version; + id->k.bversion = op->version; id->k.size = sectors; iter = bio->bi_iter; @@ -1564,16 +1616,19 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + op->flags |= BCH_WRITE_ALLOC_NOWAIT; + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "misaligned write"); + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "misaligned write"); + printbuf_exit(&buf); op->error = -EIO; goto err; } @@ -1633,8 +1688,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); + prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } @@ -1642,13 +1696,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return 
-BCH_ERR_ENOMEM_bio_write_init; if (mempool_init_page_pool(&c->bio_bounce_pages, diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 6c276a48f95d..b4626013abc8 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + #define BCH_WRITE_FLAGS() \ x(ALLOC_NOWAIT) \ x(CACHED) \ @@ -33,7 +35,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, x(SYNC) \ x(MOVE) \ x(IN_WORKER) \ - x(DONE) \ + x(SUBMITTED) \ x(IO_ERROR) \ x(CONVERT_UNWRITTEN) diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index c7f97c2c4805..6e878a6f2f0b 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -20,6 +20,7 @@ struct bch_write_bio { u64 submit_time; u64 inode_offset; + u64 nocow_bucket; struct bch_devs_list failed; u8 dev; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bc890776eb57..05b1250619ec 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,31 +27,59 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_printf(out, "seq:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); + prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); - prt_printf(out, "size:"); - prt_tab(out); + prt_printf(out, "size:\t"); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_printf(out, "expires:\t"); + prt_printf(out, "%li jiffies\n", buf->expires - jiffies); + + prt_printf(out, "flags:\t"); + if (buf->noflush) + prt_str(out, "noflush "); + if (buf->must_flush) + prt_str(out, "must_flush "); + if (buf->separate_flush) + prt_str(out, "separate_flush "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->write_started) + prt_str(out, "write_started "); + if (buf->write_allocated) + prt_str(out, "write_allocated "); + if (buf->write_done) + prt_str(out, "write_done"); prt_newline(out); printbuf_indent_sub(out, 2); @@ -66,26 +94,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool 
__journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -104,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(p->list); i++) - INIT_LIST_HEAD(&p->list[i]); - INIT_LIST_HEAD(&p->flushed); + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) + INIT_LIST_HEAD(&p->unflushed[i]); + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) + INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } @@ -174,21 +182,46 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + bch2_journal_do_writes(j); + + /* + * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an + * open journal entry + */ + wake_up(&j->wait); } /* @@ -202,7 +235,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && @@ -210,19 +242,23 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t lockdep_assert_held(&j->lock); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; new.cur_entry_offset = closed_val; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || old.cur_entry_offset == new.cur_entry_offset) return; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); if (!__journal_entry_is_open(old)) return; + if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) + old.cur_entry_offset = j->cur_entry_offset_if_blocked; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -283,6 +319,16 @@ void bch2_journal_halt(struct journal *j) spin_unlock(&j->lock); } +void 
bch2_journal_halt_locked(struct journal *j) +{ + lockdep_assert_held(&j->lock); + + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -325,7 +371,6 @@ static int journal_entry_open(struct journal *j) ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; - u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -346,6 +391,13 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } + BUG_ON(!j->cur_entry_sectors); buf->expires = @@ -380,11 +432,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -401,9 +456,9 @@ static int journal_entry_open(struct journal *j) */ j->cur_entry_u64s = u64s; - v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); @@ -415,12 +470,13 @@ static int journal_entry_open(struct journal *j) /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); - mod_delayed_work(c->io_complete_wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -445,20 +501,16 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } @@ -476,30 +528,29 @@ retry: if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check 
once more in case somebody else shut things down... */ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; } + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; + } + + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -515,30 +566,30 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -562,6 +613,16 @@ unlock: : -BCH_ERR_journal_res_get_blocked; } +static unsigned max_dev_latency(struct bch_fs *c) +{ + u64 nsecs = 0; + + for_each_rw_member(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +} + /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. @@ -573,10 +634,37 @@ unlock: * btree node write locks. 
*/ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags) + unsigned flags, + struct btree_trans *trans) { int ret; + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + HZ)) + return ret; + + if (trans) + bch2_trans_unlock_long(trans); + + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); + + remaining_wait = max(0, remaining_wait - HZ); + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + remaining_wait)) + return ret; + + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", + buf.buf); + printbuf_exit(&buf); + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); @@ -625,7 +713,7 @@ out: * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -648,7 +736,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } @@ -669,12 +757,18 @@ recheck_need_open: spin_unlock(&j->lock); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + /* + * We're called from bch2_journal_flush_seq() -> wait_event(); + * but this might block. 
We won't usually block, so we won't + * livelock: + */ + sched_annotate_sleep(); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; seq = res.seq; - buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { @@ -692,8 +786,8 @@ recheck_need_open: } /* - * if write was kicked off without a flush, flush the next sequence - * number instead + * if write was kicked off without a flush, or if we promised it + * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { @@ -702,6 +796,7 @@ recheck_need_open: } buf->must_flush = true; + j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -713,7 +808,7 @@ out: return ret; } -int bch2_journal_flush_seq(struct journal *j, u64 seq) +int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) { u64 start_time = local_clock(); int ret, ret2; @@ -724,7 +819,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) if (seq <= j->flushed_seq_ondisk) return 0; - ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + ret = wait_event_state(j->wait, + (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), + task_state); if (!ret) bch2_time_stats_update(j->flush_seq_time, start_time); @@ -743,14 +840,15 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent) int bch2_journal_flush(struct journal *j) { - return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); + return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); } /* - * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the + * range [start, end) * @seq */ -bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; @@ -759,20 +857,20 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < seq; + unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + /* journal flush already in flight, or flush requseted */ + if (buf->must_flush) goto out; buf->noflush = true; @@ -784,19 +882,14 @@ out: return ret; } -int bch2_journal_meta(struct journal *j) +static int __bch2_journal_meta(struct journal *j) { - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + struct journal_res res = {}; + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; if (!buf->flush_time) { @@ -806,7 +899,19 @@ int bch2_journal_meta(struct journal *j) 
bch2_journal_res_put(j, &res); - return bch2_journal_flush_seq(j, res.seq); + return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); +} + +int bch2_journal_meta(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) + return -EROFS; + + int ret = __bch2_journal_meta(j); + bch2_write_ref_put(c, BCH_WRITE_REF_journal); + return ret; } /* block/unlock the journal: */ @@ -814,25 +919,58 @@ int bch2_journal_meta(struct journal *j) void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); - j->blocked--; + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } spin_unlock(&j->lock); journal_wake(j); } +static void __bch2_journal_block(struct journal *j) +{ + if (!j->blocked++) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + j->cur_entry_offset_if_blocked = old.cur_entry_offset; + + if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) + break; + + new.v = old.v; + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + } +} + void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); - j->blocked++; + __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; + /* We're inside wait_event(), but using mutex_lock(: */ + sched_annotate_sleep(); mutex_lock(&j->buf_lock); spin_lock(&j->lock); max_seq = min(max_seq, journal_cur_seq(j)); @@ -844,13 +982,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? 
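
The reworked bch2_journal_block()/bch2_journal_unblock() above stop new reservations by swapping a sentinel (JOURNAL_ENTRY_BLOCKED_VAL) into the packed cur_entry_offset field, remembering the real offset in cur_entry_offset_if_blocked and restoring it when the last blocker goes away. A stripped-down userspace sketch of that save/sentinel/restore idea with C11 atomics; the field names and sentinel value here are illustrative, and the real code additionally runs under j->lock, skips the swap when the entry is already closed or errored, and uses cmpxchg loops because the offset shares a word with other reservation state.

	#include <stdatomic.h>
	#include <stdint.h>

	#define ENTRY_BLOCKED	UINT32_MAX	/* sentinel: reject new reservations */

	struct jrnl {
		_Atomic uint32_t cur_entry_offset;	/* normally the current entry offset */
		uint32_t	 offset_if_blocked;	/* saved value while blocked */
		unsigned	 blocked;		/* nesting count, protected by a lock */
	};

	static void jrnl_block(struct jrnl *j)
	{
		if (j->blocked++)
			return;				/* already blocked */
		j->offset_if_blocked =
			atomic_exchange(&j->cur_entry_offset, ENTRY_BLOCKED);
	}

	static void jrnl_unblock(struct jrnl *j)
	{
		if (--j->blocked)
			return;				/* still blocked by someone else */
		atomic_store(&j->cur_entry_offset, j->offset_if_blocked);
	}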
ERR_PTR(-EAGAIN) : buf; break; @@ -863,18 +1005,24 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } /* allocate journal on a device: */ -static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) +static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -896,30 +1044,29 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size)); + ca->mi.bucket_size, BTREE_TRIGGER_transactional)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -959,8 +1106,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -991,11 +1137,11 @@ err_unblock: for (i = 0; i < nr_got; i++) bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0)); + bu[i], BCH_DATA_free, 0, + BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); @@ -1004,26 +1150,20 @@ err_free: return ret; } -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) +static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, + unsigned nr, bool new_fs) { struct journal_device *ja = &ca->journal; - struct closure cl; int ret = 0; + struct closure cl; closure_init_stack(&cl); - down_write(&c->state_lock); - /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - goto unlock; + return 0; - while (ja->nr < nr) { + while (!ret && ja->nr < nr) { struct disk_reservation disk_res = { 0, 0, 0 }; /* @@ -1036,29 +1176,42 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * filesystem-wide allocation will succeed, this is a 
device * specific allocation - we can hang here: */ + if (!new_fs) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + } - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; + ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + if (ret == -BCH_ERR_bucket_alloc_blocked || + ret == -BCH_ERR_open_buckets_empty) + ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); - - if (ret && ret != -BCH_ERR_bucket_alloc_blocked) - break; } - bch_err_fn(c, ret); -unlock: + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + down_write(&c->state_lock); + int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); up_write(&c->state_lock); + + bch_err_fn(c, ret); return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { unsigned nr; int ret; @@ -1080,7 +1233,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; @@ -1092,7 +1245,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) if (ca->journal.nr) continue; - int ret = bch2_dev_journal_alloc(ca); + int ret = bch2_dev_journal_alloc(ca, true); if (ret) { percpu_ref_put(&ca->io_ref); return ret; @@ -1130,6 +1283,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); @@ -1139,15 +1295,19 @@ void bch2_fs_journal_stop(struct journal *j) * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ - bch2_journal_meta(j); + __bch2_journal_meta(j); journal_quiesce(j); + cancel_delayed_work_sync(&j->write_work); - BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_REPLAY_DONE, &j->flags) && - j->last_empty_seq != journal_cur_seq(j)); + WARN(!bch2_journal_error(j) && + test_bit(JOURNAL_replay_done, &j->flags) && + j->last_empty_seq != journal_cur_seq(j), + "journal shutdown error: cur seq %llu but last empty seq %llu", + journal_cur_seq(j), j->last_empty_seq); - cancel_delayed_work_sync(&j->write_work); + if (!bch2_journal_error(j)) + clear_bit(JOURNAL_running, &j->flags); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq) @@ -1157,13 +1317,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; + if (cur_seq >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + return -EINVAL; + } + genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); @@ -1196,7 +1360,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); @@ -1211,18 +1375,18 @@ int 
bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } if (!had_entries) - j->last_empty_seq = cur_seq; + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); @@ -1240,13 +1404,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1256,14 +1424,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1273,13 +1440,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1287,14 +1459,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } @@ -1303,19 +1475,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); 
darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + kvfree(j->buf[i].data); free_fifo(&j->pin); } int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,19 +1508,32 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } /* debug: */ +static const char * const bch2_journal_flags_strs[] = { +#define x(n) #n, + JOURNAL_FLAGS() +#undef x + NULL +}; + void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1356,20 +1541,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); s = READ_ONCE(j->reservations); + prt_printf(out, "flags:\t"); + prt_bitflags(out, bch2_journal_flags_strs, j->flags); + prt_newline(out); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1378,49 +1566,52 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_newline(out); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
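
The __bch2_journal_debug_to_text() hunks in this region drop the old prt_tab()/embedded "\t\t" spacing in favour of a single 28-column tabstop plus "label:\tvalue\n" format strings, so every field lines up regardless of label length. A small illustrative routine in the same style, using only printbuf calls that appear in the patch (the fields shown are just examples, not the full debug output):

	static void sketch_debug_to_text(struct printbuf *out, struct journal *j)
	{
		printbuf_tabstops_reset(out);
		printbuf_tabstop_push(out, 28);	/* one tabstop: values start at column 28 */

		prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
		prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);

		printbuf_indent_add(out, 2);	/* nested fields indent under their header */
		prt_printf(out, "blocked:\t%u\n", j->blocked);
		printbuf_indent_sub(out, 2);
	}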
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); - prt_printf(out, "current entry:\t\t"); + prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error"); + prt_printf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed"); + prt_printf(out, "closed\n"); + break; + case JOURNAL_ENTRY_BLOCKED_VAL: + prt_printf(out, "blocked\n"); break; default: - prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - prt_newline(out); - prt_printf(out, "unwritten entries:"); - prt_newline(out); + prt_printf(out, "unwritten entries:\n"); bch2_journal_bufs_to_text(out, j); - prt_printf(out, - "replay done:\t\t%i\n", - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - prt_printf(out, "space:\n"); - prt_printf(out, "\tdiscarded\t%u:%u\n", + printbuf_indent_add(out, 2); + prt_printf(out, "discarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - prt_printf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "clean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - prt_printf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "clean\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - prt_printf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "total\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); + printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1429,16 +1620,21 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "\tnr\t\t%u\n", ja->nr); - prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); + printbuf_indent_add(out, 2); + prt_printf(out, "nr\t%u\n", ja->nr); + prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need 
%u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; @@ -1450,57 +1646,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); } - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - unsigned i; - - spin_lock(&j->lock); - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); - prt_newline(out); - printbuf_indent_add(out, 2); - - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } - - if (!list_empty(&pin_list->flushed)) { - prt_printf(out, "flushed:"); - prt_newline(out); - } - - list_for_each_entry(pin, &pin_list->flushed, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4544ce24bb8a..107f7f901cd9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,9 +283,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); - } + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); } /* @@ -310,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned); + unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { @@ -326,10 +328,10 @@ static inline int journal_res_get_fast(struct journal *j, unsigned flags) { union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; /* * Check if there is still room in the current journal @@ -355,8 +357,8 @@ static inline int journal_res_get_fast(struct journal *j, if (flags & JOURNAL_RES_GET_CHECK) return 1; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - 
old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); res->ref = true; res->idx = old.idx; @@ -366,19 +368,20 @@ static inline int journal_res_get_fast(struct journal *j, } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags) + unsigned u64s, unsigned flags, + struct btree_trans *trans) { int ret; EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); res->u64s = u64s; if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, flags); + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: @@ -400,39 +403,38 @@ void bch2_journal_entry_res_resize(struct journal *, int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64); +bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); +void bch2_journal_halt_locked(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; static inline void bch2_journal_set_replay_done(struct journal *j) { - BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - set_bit(JOURNAL_REPLAY_DONE, &j->flags); + BUG_ON(!test_bit(JOURNAL_running, &j->flags)); + set_bit(JOURNAL_replay_done, &j->flags); } void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 47805193f18c..11c39e0c34f4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,63 @@ #include "sb-clean.h" #include "trace.h" +#include <linux/ioprio.h> +#include <linux/string_choices.h> + +void bch2_journal_pos_from_member_info_set(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + for_each_member_device(c, ca) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); + m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); + } +} + +void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + + unsigned 
idx = le32_to_cpu(m.last_journal_bucket); + if (idx < ca->journal.nr) + ca->journal.cur_idx = idx; + unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); + if (offset <= ca->mi.bucket_size) + ca->journal.sectors_free = ca->mi.bucket_size - offset; + } + mutex_unlock(&c->sb_lock); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + darray_for_each(j->ptrs, i) { + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); + + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -52,13 +109,15 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) { - i->ignore = true; + if (blacklisted) + i->ignore_blacklisted = true; + else + i->ignore_not_dirty = true; if (!c->opts.read_entire_journal) __journal_replay_free(c, i); @@ -84,11 +143,15 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (!c->journal.oldest_seq_found_ondisk || + le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) + c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); + /* Is this entry older than the range we need? 
*/ if (!c->opts.read_entire_journal && le64_to_cpu(j->seq) < jlist->last_seq) @@ -108,12 +171,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (le64_to_cpu(i->j.seq) >= last_seq) break; - journal_replay_free(c, i); + + journal_replay_free(c, i, false); } } @@ -131,72 +195,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; + + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bch2_journal_replay_to_text(&buf, c, dup); + + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); + + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); - if (!dup->csum_good) + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; - i->csum_good = entry_ptr.csum_good; - i->ignore = false; + darray_init(&i->ptrs); + i->csum_good = entry_ptr.csum_good; + i->ignore_blacklisted = false; + i->ignore_not_dirty = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; - return 0; -found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - } - - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -223,7 +277,7 @@ static void journal_entry_err_msg(struct printbuf *out, if (entry) { prt_str(out, " type="); - prt_str(out, bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); } if (!jset) { @@ -248,7 +302,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BKEY_INVALID_WRITE) { \ + switch (from.flags & 
BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -274,14 +328,13 @@ static void journal_entry_err_msg(struct printbuf *out, static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from, + unsigned version, int big_endian) { - int write = flags & BKEY_INVALID_WRITE; + enum bch_validate_flags flags = from.flags; + int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); - struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, @@ -314,37 +367,23 @@ static int journal_validate_key(struct bch_fs *c, } if (!write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf)) { - printbuf_reset(&buf); - journal_entry_err_msg(&buf, version, jset, entry); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - prt_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf); - - mustfix_fsck_err(c, journal_entry_bkey_invalid, - "%s", buf.buf); - + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); + if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - - printbuf_exit(&buf); return FSCK_DELETED_KEY; } + if (ret) + goto fsck_err; if (write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: - printbuf_exit(&buf); return ret; } @@ -352,18 +391,19 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; + from.level = entry->level; + from.btree = entry->btree_id; + while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, - entry->level, - entry->btree_id, - k, version, big_endian, - flags|BKEY_INVALID_JOURNAL); + int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) continue; + else if (ret) + return ret; k = bkey_next(k); } @@ -374,15 +414,16 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { - struct bkey_i *k; bool first = true; jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); + prt_char(out, ' '); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } @@ -392,11 +433,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct 
bkey_validate_context from) { struct bkey_i *k = entry->start; int ret = 0; + from.root = true; + from.level = entry->level + 1; + from.btree = entry->btree_id; + if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, @@ -413,8 +458,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, flags); + ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; fsck_err: @@ -431,7 +475,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { /* obsolete, don't care: */ return 0; @@ -446,7 +490,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -473,7 +517,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -515,7 +559,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -540,16 +584,16 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - prt_printf(out, "type=%s v=%llu", - bch2_fs_usage_types[u->entry.btree_id], - le64_to_cpu(u->v)); + prt_str(out, "type="); + bch2_prt_fs_usage_type(out, u->entry.btree_id); + prt_printf(out, " v=%llu", le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -566,7 +610,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, goto out; } - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), c, version, jset, entry, journal_entry_data_usage_bad_size, "invalid journal entry usage: %s", err.buf)) { @@ -593,7 +637,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -626,20 +670,19 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned expected = sizeof(*u); - unsigned dev; int ret = 0; if (journal_entry_err_on(bytes < expected, @@ -651,16 +694,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, return ret; } - dev = le32_to_cpu(u->dev); - - if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, version, jset, entry, - journal_entry_dev_usage_bad_dev, - "bad dev")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - if (journal_entry_err_on(u->pad, c, version, jset, entry, journal_entry_dev_usage_bad_pad, @@ -680,22 +713,28 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + if (vstruct_bytes(entry) < sizeof(*u)) + return; + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + printbuf_indent_add(out, 2); for (i = 0; i < nr_types; i++) { + prt_newline(out); bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } + printbuf_indent_sub(out, 2); } static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { return 0; } @@ -704,19 +743,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - prt_printf(out, "%.*s", bytes, l->d); + prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); } static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { + from.flags = 0; return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -729,10 +768,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, @@ -741,10 +780,41 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + struct bkey_validate_context from) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 
16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + struct bkey_validate_context); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -762,32 +832,40 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + struct bkey_validate_context from) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, flags) + version, big_endian, from) : 0; } void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { + bch2_prt_jset_entry_type(out, entry->type); + if (entry->type < BCH_JSET_ENTRY_NR) { - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_str(out, ": "); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } else { - prt_printf(out, "(unknown type %u)", entry->type); } } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { + from.journal_offset = (u64 *) entry - jset->_data; + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, version, jset, entry, journal_entry_past_jset_end, @@ -796,8 +874,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), flags); + ret = bch2_journal_entry_validate(c, jset, entry, version, + JSET_BIG_ENDIAN(jset), from); if (ret) break; } @@ -808,15 +886,19 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { - unsigned version; + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -861,15 +943,16 @@ static int jset_validate_early(struct bch_fs *c, unsigned bucket_sectors_left, unsigned sectors_read) { - size_t bytes = vstruct_bytes(jset); - unsigned version; - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = 
le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -882,6 +965,7 @@ static int jset_validate_early(struct bch_fs *c, return -EINVAL; } + size_t bytes = vstruct_bytes(jset); if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; @@ -913,11 +997,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -950,6 +1034,8 @@ reread: nr_bvecs = buf_pages(buf->data, sectors_read << 9); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_journal_read_bucket; bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; @@ -1002,6 +1088,13 @@ reread: goto err; } + if (le64_to_cpu(j->seq) > ja->highest_seq_found) { + ja->highest_seq_found = le64_to_cpu(j->seq); + ja->cur_idx = bucket; + ja->sectors_free = ca->mi.bucket_size - + bucket_remainder(ca, offset) - sectors; + } + /* * This happens sometimes if we don't have discards on - * when we've partially overwritten a bucket with new @@ -1028,9 +1121,7 @@ reread: ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, vstruct_end(j) - (void *) j->encrypted_start); - bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %s", - bch2_err_str(ret)); + bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { @@ -1072,8 +1163,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct journal_replay *r, **_r; - struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; unsigned i; int ret = 0; @@ -1093,48 +1182,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto err; } - ja->sectors_free = ca->mi.bucket_size; - - mutex_lock(&jlist->lock); - genradix_for_each_reverse(&c->journal_entries, iter, _r) { - r = *_r; - - if (!r) - continue; - - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + - vstruct_sectors(&r->j, c->block_bits); - - ja->cur_idx = r->ptrs[i].bucket; - ja->sectors_free = ca->mi.bucket_size - wrote; - goto found; - } - } - } -found: - mutex_unlock(&jlist->lock); - - if (ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size) { -#if 0 - /* - * Debug code for ZNS support, where we (probably) want to be - * correlated where we stopped in the journal to the zone write - * points: - */ - bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); - bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); - for (i = 0; i < 3; i++) { - unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; - - bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); - } -#endif - ja->sectors_free = 0; - } - /* * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't @@ -1144,7 +1191,7 @@ found: ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: 
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1155,27 +1202,6 @@ err: goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - unsigned i; - - for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); - u64 offset; - - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); - - if (i) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, @@ -1224,27 +1250,29 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; - i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - i->ignore = true; + i->ignore_blacklisted = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - i->ignore = true; + i->ignore_blacklisted = true; continue; } + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(i->j.seq), + }; if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, @@ -1280,12 +1308,12 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); if (seq < *last_seq) { - journal_replay_free(c, i); + journal_replay_free(c, i, false); continue; } @@ -1293,7 +1321,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err_on(!JSET_NO_FLUSH(&i->j), c, jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); - i->ignore = true; + i->ignore_blacklisted = true; } } @@ -1302,7 +1330,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); @@ -1335,7 +1363,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" " prev at %s\n" - " next at %s", + " next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1351,34 +1379,34 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, + .e.nr_devs = 0, .e.nr_required = 1, }; - unsigned ptr; i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? 
" (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch2_dev_have_ref(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); @@ -1404,27 +1432,50 @@ fsck_err: /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal *j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; - - if (*replicas >= replicas_want) - return; - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1470,65 +1521,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); -retry: - devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; + } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); +retry_target: + devs = target_rw_devs(c, BCH_DATA_journal, target); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); +retry_alloc: + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - if (replicas >= replicas_want) + if (likely(replicas >= replicas_want)) goto done; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - ja = 
&ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); - if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1547,7 +1586,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1558,7 +1597,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) @@ -1568,12 +1607,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1593,63 +1632,69 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; + + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if (!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before 
signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + closure_wake_up(&w->wait); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - closure_wake_up(&w->wait); - journal_wake(j); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1659,47 +1704,52 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + + /* + * We don't typically trigger journal writes from her - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } -static CLOSURE_CALLBACK(do_journal_write) +static CLOSURE_CALLBACK(journal_write_submit) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = 
journal_last_unwritten_buf(j); - struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (!ca) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); continue; @@ -1708,11 +1758,13 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; @@ -1727,11 +1779,50 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); +} + +static CLOSURE_CALLBACK(journal_write_preflush) +{ + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + spin_lock(&j->lock); + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + continue_at(cl, journal_write_preflush, j->wq); + return; + } + spin_unlock(&j->lock); + } + + if (w->separate_flush) { + for_each_rw_member(c, ca) { + percpu_ref_get(&ca->io_ref); + + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; + bio_reset(bio, ca->disk_sb.bdev, + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + + continue_at(cl, journal_write_submit, j->wq); + } else { + /* + * no need to punt to another work item if we're not waiting on + * preflushes + */ + journal_write_submit(&cl->work); + } } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1782,11 +1873,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (!wb.wb) bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - struct bkey_i *k; jset_entry_for_each_key(i, k) { ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); if (ret) { - bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", + bch2_err_str(ret)); bch2_journal_keys_to_write_buffer_end(c, &wb); return ret; } @@ -1796,17 +1887,32 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) } } - if (wb.wb) - bch2_journal_keys_to_write_buffer_end(c, &wb); + if (wb.wb) { + ret = bch2_journal_keys_to_write_buffer_end(c, &wb); + if (ret) { + bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", + bch2_err_str(ret)); + return ret; + } + } + + spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); start = end = vstruct_last(jset); end 
= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; - BUG_ON(u64s > j->entry_u64s_reserved); + + WARN_ON(u64s > j->entry_u64s_reserved); le32_add_cpu(&jset->u64s, u64s); @@ -1814,7 +1920,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) bytes = vstruct_bytes(jset); if (sectors > w->sectors) { - bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", vstruct_bytes(jset), w->sectors << 9, u64s, w->u64s_reserved, j->entry_u64s_reserved); return -EINVAL; @@ -1842,8 +1948,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret)) + if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), @@ -1878,14 +1983,15 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * So if we're in an error state, and we're still starting up, we don't * write anything at all. */ - if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) return -EIO; if (error || w->noflush || (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + time_before(jiffies, j->last_flush_write + + msecs_to_jiffies(c->opts.journal_flush_delay)) && + test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; @@ -1893,9 +1999,10 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * j->nr_noflush_writes++; } else { + w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + clear_bit(JOURNAL_need_flush_write, &j->flags); } return 0; @@ -1903,20 +2010,27 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; - struct bio *bio; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; + for_each_rw_member(c, ca) + nr_rw_members++; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); + BUG_ON(w->write_allocated); + BUG_ON(w->write_done); j->write_start_time = local_clock(); spin_lock(&j->lock); + if (nr_rw_members > 1) + w->separate_flush = true; + ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); if (ret) @@ -1942,26 +2056,35 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { - 
__bch2_journal_debug_to_text(&journal_debug_buf, j); + if (ret && !bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); - goto err; + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1969,12 +2092,6 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(c, ca) - nr_rw_members++; - - if (nr_rw_members > 1) - w->separate_flush = true; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. @@ -1985,25 +2102,15 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; - if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { - for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - } - - continue_at(cl, do_journal_write, c->io_complete_wq); + if (!JSET_NO_FLUSH(w->data)) + continue_at(cl, journal_write_preflush, j->wq); + else + continue_at(cl, journal_write_submit, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index c035e7c108e1..12b39fcb4424 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,26 +2,38 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include "darray.h" + +void bch2_journal_pos_from_member_info_set(struct bch_fs *); +void bch2_journal_pos_from_member_info_resume(struct bch_fs *); + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; - bool ignore; + bool ignore_blacklisted; + bool ignore_not_dirty; /* must be last: */ struct jset j; }; +static inline bool journal_replay_ignore(struct journal_replay *i) +{ + return !i || i->ignore_blacklisted || i->ignore_not_dirty; +} + static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { @@ -36,12 +48,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, } #define for_each_jset_entry_type(entry, jset, type) \ - for (entry = 
(jset)->start; \ + for (struct jset_entry *entry = (jset)->start; \ (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) #define jset_entry_for_each_key(_e, _k) \ - for (_k = (_e)->start; \ + for (struct bkey_i *_k = (_e)->start; \ _k < vstruct_last(_e); \ _k = bkey_next(_k)) @@ -51,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + struct bkey_validate_context); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); @@ -62,4 +74,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c33dca641575..d373cd181a7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; @@ -62,14 +65,13 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); + mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); + swap(watermark, j->watermark); if (watermark > j->watermark) journal_wake(j); @@ -138,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + unsigned min_bucket_size = U32_MAX; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr) + if (!ca->journal.nr || + !ca->mi.durability) continue; + min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); + space = journal_dev_space_available(j, ca, from); if (!space.next_entry) continue; @@ -165,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ - return dev_space[nr_devs_want - 1]; + space = dev_space[nr_devs_want - 1]; + space.next_entry = min(space.next_entry, min_bucket_size); + return space; 
} void bch2_journal_space_available(struct journal *j) @@ -206,6 +214,18 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + rcu_read_unlock(); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -226,9 +246,9 @@ void bch2_journal_space_available(struct journal *j) j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + set_bit(JOURNAL_may_skip_flush, &j->flags); else - clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + clear_bit(JOURNAL_may_skip_flush, &j->flags); bch2_journal_set_watermark(j); out: @@ -307,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) popped = true; } - if (popped) + if (popped) { bch2_journal_space_available(j); + __closure_wake_up(&j->reclaim_flush_wait); + } } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -342,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, pin->seq = 0; list_del_init(&pin->list); + if (j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + /* * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: @@ -359,15 +384,19 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_btree; - else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_key_cache; + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_TYPE_key_cache; else - return JOURNAL_PIN_other; + return JOURNAL_PIN_TYPE_other; } static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, @@ -386,7 +415,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, &pin_list->list[type]); + + if (list_empty(&pin_list->unflushed[type]) && + j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + + list_add(&pin->list, &pin_list->unflushed[type]); } void bch2_journal_pin_copy(struct journal *j, @@ -394,8 +428,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +443,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if 
(reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** @@ -481,16 +513,15 @@ journal_get_next_pin(struct journal *j, { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { if (*seq > seq_to_flush && !allowed_above_seq) break; - for (i = 0; i < JOURNAL_PIN_NR; i++) - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || - ((1U << i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->list[i], + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || + (BIT(i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->unflushed[i], struct journal_entry_pin, list); if (ret) return ret; @@ -526,8 +557,8 @@ static size_t journal_flush_pins(struct journal *j, } if (min_key_cache) { - allowed_above |= 1U << JOURNAL_PIN_key_cache; - allowed_below |= 1U << JOURNAL_PIN_key_cache; + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); } cond_resched(); @@ -535,7 +566,9 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -558,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -632,6 +665,7 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_cache *bc = &c->btree_cache; bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; @@ -672,7 +706,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; - if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; + if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) min_nr = 1; min_key_cache = 
min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); @@ -680,8 +715,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - atomic_read(&c->btree_cache.dirty), - c->btree_cache.used, + atomic_long_read(&bc->nr_dirty), btree_cache_live, atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); @@ -748,10 +782,12 @@ static int bch2_journal_reclaim_thread(void *arg) journal_empty = fifo_empty(&j->pin); spin_unlock(&j->lock); + long timeout = j->next_reclaim - jiffies; + if (journal_empty) schedule(); - else if (time_after(j->next_reclaim, jiffies)) - schedule_timeout(j->next_reclaim - jiffies); + else if (timeout > 0) + schedule_timeout(timeout); else break; } @@ -795,10 +831,41 @@ int bch2_journal_reclaim_start(struct journal *j) return 0; } +static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + struct journal_entry_pin_list *pin_list; + u64 seq; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { + if (seq > seq_to_flush) + break; + + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if ((BIT(i) & types) && + (!list_empty(&pin_list->unflushed[i]) || + !list_empty(&pin_list->flushed[i]))) { + spin_unlock(&j->lock); + return true; + } + } + spin_unlock(&j->lock); + + return false; +} + +static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || + journal_pins_still_flushing(j, seq_to_flush, types); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool *did_work) { - int ret; + int ret = 0; ret = bch2_journal_error(j); if (ret) @@ -806,12 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_key_cache)| - (1U << JOURNAL_PIN_other), 0, 0, 0) || - journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_btree), 0, 0, 0)) - *did_work = true; + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -821,11 +889,12 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_replay_done, &j->flags) || journal_last_seq(j) > seq_to_flush || !fifo_used(&j->pin); spin_unlock(&j->lock); +unlock: mutex_unlock(&j->reclaim_lock); return ret; @@ -836,10 +905,10 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) /* time_stats this */ bool did_work = false; - if (!test_bit(JOURNAL_STARTED, &j->flags)) + if (!test_bit(JOURNAL_running, &j->flags)) return false; - closure_wait_event(&j->async_wait, + closure_wait_event(&j->reclaim_flush_wait, journal_flush_done(j, seq_to_flush, &did_work)); return did_work; @@ -905,3 +974,54 @@ err: return ret; } + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + + spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + + 
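/*
 * Standalone sketch of the per-type pin bucketing used above: each journal
 * sequence keeps one unflushed list per pin type, and flushing walks the
 * types from JOURNAL_PIN_TYPE_other down towards the interior btree levels.
 * The counters below stand in for the real linked lists, and the real
 * journal_flush_done() stops after the first class that made progress and
 * re-runs via the reclaim_flush waitlist; illustrative only.
 */
#include <stdio.h>

#define BIT(n)		(1U << (n))

enum pin_type {
	PIN_btree3,	/* deepest interior nodes: flushed last */
	PIN_btree2,
	PIN_btree1,
	PIN_btree0,	/* leaf nodes */
	PIN_key_cache,
	PIN_other,	/* flushed first */
	PIN_NR,
};

struct pin_lists {
	unsigned nr[PIN_NR];	/* number of unflushed pins of each type */
};

/* flush every pin whose type is allowed by @mask; returns nonzero if we did work */
static int flush_pins(struct pin_lists *p, unsigned mask)
{
	int did_work = 0;

	for (unsigned i = 0; i < PIN_NR; i++)
		while ((BIT(i) & mask) && p->nr[i]) {
			p->nr[i]--;
			did_work = 1;
		}
	return did_work;
}

int main(void)
{
	struct pin_lists p = {
		.nr = { [PIN_other] = 2, [PIN_key_cache] = 1,
			[PIN_btree0] = 3, [PIN_btree2] = 1 },
	};

	/* same walk order as journal_flush_done(): one type class per pass */
	for (int type = PIN_NR - 1; type >= 0; --type)
		if (flush_pins(&p, BIT(type)))
			printf("pass flushed pins of type %d\n", type);
	return 0;
}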
*seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); + printbuf_indent_add(out, 2); + + prt_printf(out, "unflushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) + list_for_each_entry(pin, &pin_list->unflushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + prt_printf(out, "flushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) + list_for_each_entry(pin, &pin_list->flushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index ec84c3345281..0a73d7134e1c 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) int bch2_journal_flush_device_pins(struct journal *, int); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index ae4fb8c3a2bc..62b910f2fb27 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); @@ -99,13 +98,13 @@ static int u64_range_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } -static int bch2_sb_journal_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; + u64 sum = 0; unsigned nr; unsigned i; struct u64_range *b; @@ -121,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + + if (b[i].end <= b[i].start) { + prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].nr)); + goto err; + } + + sum += le64_to_cpu(journal->d[i].nr); } sort(b, nr, sizeof(*b), u64_range_cmp, NULL); @@ -150,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, } } + if (sum > UINT_MAX) { + prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); + goto err; + } + ret = 0; err: kfree(b); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0200e299cfbb..1f25c111c54c 100644 --- 
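/*
 * Standalone sketch of the extra superblock-journal validation added above:
 * reject zero-length bucket ranges, reject a total bucket count that would
 * overflow a 32-bit counter, and (as before) reject overlapping ranges after
 * sorting.  Plain userspace C, illustrative only.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct u64_range { uint64_t start, end; };

static int u64_range_cmp(const void *_l, const void *_r)
{
	const struct u64_range *l = _l, *r = _r;

	return l->start < r->start ? -1 : l->start > r->start;
}

static int validate_journal_ranges(struct u64_range *b, unsigned nr)
{
	uint64_t sum = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (b[i].end <= b[i].start) {
			fprintf(stderr, "entry %u has bad nr\n", i);
			return -1;
		}
		sum += b[i].end - b[i].start;
	}

	if (sum > UINT_MAX) {
		fprintf(stderr, "too many journal buckets: %llu\n",
			(unsigned long long) sum);
		return -1;
	}

	qsort(b, nr, sizeof(*b), u64_range_cmp);

	for (unsigned i = 0; i + 1 < nr; i++)
		if (b[i].end > b[i + 1].start) {
			fprintf(stderr, "entries %u and %u overlap\n", i, i + 1);
			return -1;
		}

	return 0;
}

int main(void)
{
	struct u64_range r[] = { { 128, 192 }, { 0, 64 }, { 64, 128 } };

	return validate_journal_ranges(r, 3) ? 1 : 0;
}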
a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_iter.h" #include "eytzinger.h" +#include "journal.h" #include "journal_seq_blacklist.h" #include "super-io.h" @@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr) return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } -static struct bch_sb_field_journal_seq_blacklist * -blacklist_entry_try_merge(struct bch_fs *c, - struct bch_sb_field_journal_seq_blacklist *bl, - unsigned i) -{ - unsigned nr = blacklist_nr_entries(bl); - - if (le64_to_cpu(bl->start[i].end) >= - le64_to_cpu(bl->start[i + 1].start)) { - bl->start[i].end = bl->start[i + 1].end; - --nr; - memmove(&bl->start[i], - &bl->start[i + 1], - sizeof(bl->start[0]) * (nr - i)); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr)); - BUG_ON(!bl); - } - - return bl; -} - -static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, - u64 start, u64 end) -{ - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -} - int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i, nr; + unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); - for (i = 0; i < nr; i++) { + while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; - if (bl_entry_contig_or_overlaps(e, start, end)) { - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; + if (end < le64_to_cpu(e->start)) + break; + + if (start > le64_to_cpu(e->end)) { + i++; + continue; } + + /* + * Entry is contiguous or overlapping with new entry: merge it + * with new entry, and delete: + */ + + start = min(start, le64_to_cpu(e->start)); + end = max(end, le64_to_cpu(e->end)); + array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, @@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) goto out; } - bl->start[nr].start = cpu_to_le64(start); - bl->start[nr].end = cpu_to_le64(end); -out_write_sb: + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), + .end = cpu_to_le64(end), + })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); @@ -119,8 +95,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; @@ -165,8 +140,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) if (!bl) return 0; - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, - GFP_KERNEL); + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; @@ -188,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf 
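/*
 * Standalone sketch of the rewritten blacklist insert above: walk the sorted
 * entries, absorb (and delete) anything contiguous or overlapping with the
 * new [start, end] range, then insert the merged result at the gap.  The
 * memmove() calls stand in for array_remove_item()/array_insert_item();
 * illustrative only, with no bounds or resize handling.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct seq_range { uint64_t start, end; };

static void blacklist_add(struct seq_range *r, unsigned *nr,
			  uint64_t start, uint64_t end)
{
	unsigned i = 0;

	while (i < *nr) {
		if (end < r[i].start)		/* strictly before: insert here */
			break;

		if (start > r[i].end) {		/* strictly after: keep looking */
			i++;
			continue;
		}

		/* contiguous or overlapping: merge with it, then delete it */
		start = start < r[i].start ? start : r[i].start;
		end   = end   > r[i].end   ? end   : r[i].end;
		memmove(&r[i], &r[i + 1], sizeof(r[0]) * (*nr - i - 1));
		--*nr;
	}

	memmove(&r[i + 1], &r[i], sizeof(r[0]) * (*nr - i));
	r[i] = (struct seq_range) { .start = start, .end = end };
	++*nr;
}

int main(void)
{
	struct seq_range r[16];
	unsigned nr = 0;

	blacklist_add(r, &nr, 10, 20);
	blacklist_add(r, &nr, 40, 50);
	blacklist_add(r, &nr, 18, 42);	/* bridges both existing ranges */

	for (unsigned i = 0; i < nr; i++)
		printf("[%llu, %llu]\n",
		       (unsigned long long) r[i].start,
		       (unsigned long long) r[i].end);
	return 0;
}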
*err) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); @@ -243,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .to_text = bch2_sb_journal_seq_blacklist_to_text }; -void bch2_blacklist_entries_gc(struct work_struct *work) +bool bch2_blacklist_entries_gc(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans *trans = bch2_trans_get(c); - unsigned i, nr, new_nr; - int ret; - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_stopping, &c->flags)) - b = bch2_btree_iter_next_node(&iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_iter_exit(trans, &iter); - } - - bch2_trans_put(trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) - goto out; + return false; - nr = blacklist_nr_entries(bl); + unsigned nr = blacklist_nr_entries(bl); dst = bl->start; - t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - for (src = bl->start, i = eytzinger0_first(t->nr); + unsigned i; + for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - if (t->entries[i].dirty) + if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; } - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); + unsigned new_nr = dst - bl->start; + if (new_nr == nr) + return false; - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + return true; } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e25..d47636f96fdc 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); +bool bch2_blacklist_entries_gc(struct bch_fs *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h new file mode 100644 index 000000000000..2566b12dbc04 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist_format.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 38817c7a0851..1ef3a28ed6ab 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,6 +9,9 @@ #include "super_types.h" #include "fifo.h" +/* btree write buffer steals 8 bits for its own purposes: */ +#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) + #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -18,6 +21,7 @@ * the journal that are being staged or in flight. */ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -33,10 +37,14 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; /* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; + u8 idx; }; /* @@ -45,15 +53,18 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_btree, - JOURNAL_PIN_key_cache, - JOURNAL_PIN_other, - JOURNAL_PIN_NR, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, + JOURNAL_PIN_TYPE_key_cache, + JOURNAL_PIN_TYPE_other, + JOURNAL_PIN_TYPE_NR, }; struct journal_entry_pin_list { - struct list_head list[JOURNAL_PIN_NR]; - struct list_head flushed; + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; }; @@ -107,6 +118,7 @@ union journal_res_state { */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) @@ -124,16 +136,23 @@ enum journal_space_from { journal_space_nr, }; +#define JOURNAL_FLAGS() \ + x(replay_done) \ + x(running) \ + x(may_skip_flush) \ + x(need_flush_write) \ + x(space_low) + enum journal_flags { 
- JOURNAL_REPLAY_DONE, - JOURNAL_STARTED, - JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NEED_FLUSH_WRITE, +#define x(n) JOURNAL_##n, + JOURNAL_FLAGS() +#undef x }; /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ @@ -149,6 +168,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -174,6 +200,7 @@ struct journal { * insufficient devices: */ enum journal_errors cur_entry_error; + unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; /* @@ -202,9 +229,10 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist reclaim_flush_wait; - struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -212,9 +240,11 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; + u64 flushing_seq; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; + u64 oldest_seq_found_ondisk; /* * FIFO of journal entries whose btree updates have not yet been @@ -274,11 +304,6 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *flush_seq_time; @@ -313,10 +338,11 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; + u64 highest_seq_found; }; /* diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index ad598105c587..75f27ec26f85 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -34,30 +34,40 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; - const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); - struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (!fn) - return 0; + fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), + trans, logged_op_but_clean, + "filesystem marked as clean but have logged op\n%s", + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf)); + struct bkey_buf sk; bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: - fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type); + if (fn) + fn->resume(trans, sk.k); + + ret = bch2_logged_op_finish(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - return ret; +fsck_err: + printbuf_exit(&buf); + return ret ?: trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_logged_ops, POS_MIN, - BTREE_ITER_PREFETCH, k, + for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), + BTREE_ITER_prefetch, k, 
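/*
 * Standalone sketch of the x-macro pattern the new JOURNAL_FLAGS() list uses:
 * one list of names expanded once into an enum and, as bcachefs does for
 * similar lists elsewhere (assumed here, not shown in this hunk), once more
 * into a matching string table for debug output.  Illustrative only.
 */
#include <stdio.h>

#define JOURNAL_FLAGS()		\
	x(replay_done)		\
	x(running)		\
	x(may_skip_flush)	\
	x(need_flush_write)	\
	x(space_low)

enum journal_flags {
#define x(n)	JOURNAL_##n,
	JOURNAL_FLAGS()
#undef x
	JOURNAL_FLAG_NR,
};

static const char * const journal_flag_strs[] = {
#define x(n)	#n,
	JOURNAL_FLAGS()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < JOURNAL_FLAG_NR; i++)
		printf("bit %u: %s\n", i, journal_flag_strs[i]);
	return 0;
}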
resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); return ret; @@ -66,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; - int ret; - - ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; @@ -85,7 +94,7 @@ int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) __bch2_logged_op_start(trans, k)); } -void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) +int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) { int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); @@ -101,8 +110,10 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", - __func__, buf.buf, bch2_err_str(ret)); + bch2_fs_fatal_error(c, "deleting logged operation %s: %s", + buf.buf, bch2_err_str(ret)); printbuf_exit(&buf); } + + return ret; } diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h index 4d1e786a27a8..30ae9ef737dd 100644 --- a/fs/bcachefs/logged_ops.h +++ b/fs/bcachefs/logged_ops.h @@ -15,6 +15,6 @@ static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i int bch2_resume_logged_ops(struct bch_fs *); int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); -void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); +int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); #endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 6a4bf7129dba..cfb67c95d4c8 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,6 +2,11 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; + struct bch_logged_op_truncate { struct bch_val v; __le32 subvol; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 7a4ca5a28b3e..ce794d55818f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "bkey_buf.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -10,14 +11,13 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, - lru_entry_at_time_0, + bkey_fsck_err_on(!lru_pos_time(k.k->p), + c, lru_entry_at_time_0, "lru entry at time=0"); fsck_err: return ret; @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } @@ -77,10 +77,49 @@ static const char * const bch2_lru_types[] = { NULL }; +int bch2_lru_check_set(struct btree_trans *trans, + u16 lru_id, u64 time, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter lru_iter; + struct bkey_s_c lru_k = + bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(lru_id, + bucket_to_u64(referring_k.k->p), + time), 0); + int ret = bkey_err(lru_k); + if (ret) + return ret; + + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); + if (ret) + goto err; + + if (fsck_err(trans, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n" + " %s", + bch2_lru_types[lru_type(lru_k)], + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, - struct bpos *last_flushed_pos) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -94,11 +133,13 @@ static int bch2_check_lru_key(struct btree_trans *trans, u64 idx; int ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, - lru_entry_to_invalid_bucket, + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); + + if (fsck_err_on(!ca, + trans, lru_entry_to_invalid_bucket, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_delete_at(trans, lru_iter, 0); + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); ret = bkey_err(k); @@ -112,21 +153,17 @@ static int bch2_check_lru_key(struct btree_trans *trans, idx = alloc_lru_idx_read(*a); break; case BCH_LRU_fragmentation: - idx = a->fragmentation_lru; + idx = alloc_lru_idx_fragmentation(*a, ca); break; } if (lru_k.k->type != KEY_TYPE_set || lru_pos_time(lru_k.k->p) != idx) { - if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { - *last_flushed_pos = lru_k.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); + if (ret) + goto err; - if (c->opts.reconstruct_alloc || - fsck_err(c, lru_entry_bad, + if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", @@ -134,12 +171,12 @@ static int bch2_check_lru_key(struct btree_trans *trans, lru_pos_time(lru_k.k->p), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) - ret = bch2_btree_delete_at(trans, lru_iter, 0); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); } -out: err: fsck_err: bch2_trans_iter_exit(trans, &iter); + bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; @@ -147,12 +184,18 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct bpos last_flushed_pos = POS_MIN; + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, 
BTREE_ITER_PREFETCH, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, - bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_lru_key(trans, &iter, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 429dca816df5..f31a6cf1514c 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; @@ -27,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) - static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; @@ -48,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_invalid = bch2_lru_invalid, \ + .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) @@ -64,6 +48,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +struct bkey_buf; +int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); + int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h new file mode 100644 index 000000000000..f372cb3b8cda --- /dev/null +++ b/fs/bcachefs/lru_format.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_FORMAT_H +#define _BCACHEFS_LRU_FORMAT_H + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +#endif /* _BCACHEFS_LRU_FORMAT_H */ diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index bf0ef668fd38..0ea9f30803a2 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. 
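/*
 * Standalone sketch of the LRU key packing the helpers above decode: a 16-bit
 * LRU id lives in the top bits of the key's inode field and a 48-bit time in
 * the low LRU_TIME_BITS.  Simplified (the real lru_pos() also stores the
 * device:bucket in the key offset); illustrative only.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define LRU_TIME_BITS	48
#define LRU_TIME_MAX	((1ULL << LRU_TIME_BITS) - 1)

static uint64_t lru_pack(uint16_t lru_id, uint64_t time)
{
	return ((uint64_t) lru_id << LRU_TIME_BITS) | (time & LRU_TIME_MAX);
}

static uint16_t lru_unpack_id(uint64_t inode)	{ return inode >> LRU_TIME_BITS; }
static uint64_t lru_unpack_time(uint64_t inode)	{ return inode & LRU_TIME_MAX; }

int main(void)
{
	uint64_t v = lru_pack(1, 123456789);

	printf("lru id %u, time %" PRIu64 "\n",
	       lru_unpack_id(v), lru_unpack_time(v));
	return 0;
}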
*/ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 64df11ab422b..47e4a3c3d26e 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -111,11 +111,11 @@ static inline u128_u u128_shl(u128_u i, s8 shift) { u128_u r; - r.lo = i.lo << shift; + r.lo = i.lo << (shift & 63); if (shift < 64) - r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); + r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63)); else { - r.hi = i.lo << (shift - 64); + r.hi = i.lo << (-shift & 63); r.lo = 0; } return r; @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); 
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 019583c3ca0e..e9d9c0212e44 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test) static void mean_and_variance_weighted_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 2 }; + struct mean_and_variance_weighted s = { }; - mean_and_variance_weighted_update(&s, 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, 10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, 20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, 20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, 30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, 30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - s = (struct mean_and_variance_weighted) { .weight = 2 }; + s = (struct mean_and_variance_weighted) { }; - mean_and_variance_weighted_update(&s, -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, -10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, -20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, -20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, -30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, -30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); } static void mean_and_variance_weighted_advanced_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 8 }; + struct mean_and_variance_weighted s = { }; + bool initted = false; s64 i; - for (i = 10; i <= 100; i += 10) - 
mean_and_variance_weighted_update(&s, i); + for (i = 10; i <= 100; i += 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - s = (struct mean_and_variance_weighted) { .weight = 8 }; + s = (struct mean_and_variance_weighted) { }; + initted = false; - for (i = -10; i >= -100; i -= 10) - mean_and_variance_weighted_update(&s, i); + for (i = -10; i >= -100; i -= 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); } static void do_mean_and_variance_test(struct kunit *test, @@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test, s64 *weighted_stddev) { struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { .weight = weight }; + struct mean_and_variance_weighted vw = { }; for (unsigned i = 0; i < initial_n; i++) { mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value); + mean_and_variance_weighted_update(&vw, initial_value, false, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); } for (unsigned i = 0; i < n; i++) { mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i]); + mean_and_variance_weighted_update(&vw, data[i], true, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); } KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); @@ -130,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - /* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_3(struct kunit *test) +static void mean_and_variance_test_2(struct kunit *test) { s64 d[] = { 100, 100, 100, 100, 100 }; s64 
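/*
 * Standalone sketch of the fixed-point EWMA the weighted helpers above
 * implement, with the "initted"/"weight" state now supplied by the caller
 * instead of being stored in the struct.  Only the mean half is shown, and a
 * plain arithmetic shift stands in for fast_divpow2(); with weight 8 and
 * inputs 10..100 this lands on the same mean of 11 the unit test expects.
 */
#include <stdint.h>
#include <stdio.h>

struct ewma {
	int64_t mean;	/* stored left-shifted by the weight for extra precision */
};

static void ewma_update(struct ewma *s, int64_t x, int initted, uint8_t weight)
{
	int64_t x_w = x << weight;

	if (!initted)
		s->mean = x_w;
	else
		s->mean += (x_w - s->mean) >> weight;
}

static int64_t ewma_get_mean(struct ewma s, uint8_t weight)
{
	return s.mean >> weight;
}

int main(void)
{
	struct ewma s = { 0 };
	int initted = 0;

	for (int64_t i = 10; i <= 100; i += 10) {
		ewma_update(&s, i, initted, 8);
		initted = 1;
	}

	printf("weighted mean = %lld\n", (long long) ewma_get_mean(s, 8));	/* 11 */
	return 0;
}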
mean[] = { 22, 32, 40, 46, 50 }; @@ -155,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_4(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 10, 11, 12, 13, 14 }; - s64 stddev[] = { 9, 13, 15, 17, 19 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - static void mean_and_variance_fast_divpow2(struct kunit *test) { s64 i; @@ -224,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_2), - KUNIT_CASE(mean_and_variance_test_3), - KUNIT_CASE(mean_and_variance_test_4), {} }; @@ -237,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = { kunit_test_suite(mean_and_variance_test_suite); MODULE_AUTHOR("Daniel B. Hill"); +MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5623cee3ef86..ddc187fb693d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ @@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) continue; ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), @@ -132,10 +132,8 @@ retry: ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); 
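/*
 * Standalone sketch of why the -EINVAL returns in migrate.c above became
 * private error codes: the code itself can be turned back into a readable
 * name at the call site, so the hand-written "Cannot drop device..." message
 * can be dropped.  The table below is a simplified stand-in for bcachefs'
 * errcode machinery (bch2_err_str() etc.); illustrative only.
 */
#include <stdio.h>

enum {
	ERR_remove_would_lose_data = 1,
	ERR_remove_with_metadata_missing_unimplemented,
	ERR_NR,
};

static const char * const err_strs[] = {
	[ERR_remove_would_lose_data]				= "remove_would_lose_data",
	[ERR_remove_with_metadata_missing_unimplemented]	= "remove_with_metadata_missing_unimplemented",
};

static const char *err_str(int err)
{
	unsigned i = -err;

	return i && i < ERR_NR ? err_strs[i] : "(unknown error)";
}

int main(void)
{
	int ret = -ERR_remove_would_lose_data;

	printf("device remove failed: %s\n", err_str(ret));
	return 0;
}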
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index bf68ea49447b..160b4374160a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -21,6 +21,8 @@ #include "journal_reclaim.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" +#include "reflink.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -36,36 +38,6 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:"); - prt_tab(out); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str(out, "kill ptrs: "); - prt_tab(out); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str(out, "target: "); - prt_tab(out); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str(out, "compression: "); - prt_tab(out); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); - prt_newline(out); - - prt_str(out, "extra replicas: "); - prt_tab(out); - prt_u64(out, data_opts->extra_replicas); -} - static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) @@ -226,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + /* + * Generally, releasing a transaction within a transaction restart means + * an unhandled transaction restart: but this can happen legitimately + * within the move code, e.g. when bch2_move_ratelimit() tells us to + * exit before we've retried + */ + bch2_trans_begin(ctxt->trans); bch2_trans_put(ctxt->trans); memset(ctxt, 0, sizeof(*ctxt)); } @@ -296,7 +275,7 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas) { if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; } @@ -322,8 +301,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write_sectors = k.k->size; bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - bio_set_prio(&io->write.op.wbio.bio, - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->write.op.wbio.bio.bi_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, GFP_KERNEL)) @@ -333,7 +312,7 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; - bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); io->rbio.bio.bi_iter.bi_size = sectors << 9; io->rbio.bio.bi_opf = REQ_OP_READ; @@ -409,34 +388,42 @@ err: return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { struct bch_fs *c = trans->c; u32 restart_count = trans->restart_count; + struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; - if (io_opts->cur_inum != extent_k.k->p.inode) { + if 
(extent_k.k->type == KEY_TYPE_reflink_v) + goto out; + + if (io_opts->cur_inum != extent_pos.inode) { io_opts->d.nr = 0; - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - if (k.k->p.offset != extent_k.k->p.inode) + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), + BTREE_ITER_all_snapshots, k, ({ + if (k.k->p.offset != extent_pos.inode) break; if (!bkey_is_inode(k.k)) continue; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_k.k->p.inode; + io_opts->cur_inum = extent_pos.inode; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -445,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) - return &i->io_opts; - - return &io_opts->fs_io_opts; + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { + opts_ret = &i->io_opts; + break; + } +out: + ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + if (ret) + return ERR_PTR(ret); + return opts_ret; } int bch2_move_get_io_opts_one(struct btree_trans *trans, struct bch_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; + struct bch_fs *c = trans->c; + + *io_opts = bch2_opts_to_inode_opts(c->opts); /* reflink btree? */ - if (!extent_k.k->p.inode) { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - return 0; - } + if (!extent_k.k->p.inode) + goto out; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + struct btree_iter inode_iter; + struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_CACHED); - ret = bkey_err(k); + BTREE_ITER_cached); + int ret = bkey_err(inode_k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - if (!ret && bkey_is_inode(k.k)) { + if (!ret && bkey_is_inode(inode_k.k)) { struct bch_inode_unpacked inode; - bch2_inode_unpack(k, &inode); - bch2_inode_opts_get(io_opts, trans->c, &inode); - } else { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get(io_opts, c, &inode); } - - bch2_trans_iter_exit(trans, &iter); - return 0; + bch2_trans_iter_exit(trans, &inode_iter); +out: + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) @@ -539,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, struct per_snapshot_io_opts snapshot_io_opts; struct bch_io_opts *io_opts; struct bkey_buf sk; - struct btree_iter iter; + struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; + /* + * If we're moving a single file, also process reflinked data it points + * to (this includes propagating changed io_opts from the inode to the + * extent): + */ + bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -552,14 +548,17 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = 
BBPOS(btree_id, start); } + bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { + struct btree_iter *extent_iter = &iter; + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -578,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + if (walk_indirect && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + + bch2_trans_iter_exit(trans, &reflink_iter); + k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_deleted(k.k)) + goto next_nondata; + + /* + * XXX: reflink pointers may point to multiple indirect + * extents, so don't advance past the entire reflink + * pointer - need to fixup iter->k + */ + extent_iter = &reflink_iter; + } + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, extent_iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -597,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -618,6 +643,7 @@ next_nondata: bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); @@ -683,20 +709,22 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; + struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; - struct bch_backpointer bp; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct bkey_s_c k; struct data_update_opts data_opts; - unsigned dirty_sectors, bucket_size; - u64 fragmentation; - struct bpos bp_pos = POS_MIN; + unsigned sectors_moved = 0; + struct bkey_buf last_flushed; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + if (!ca) + return 0; + trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); bch2_bkey_buf_init(&sk); /* @@ -704,21 +732,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = bch2_bucket_sectors_dirty(*a); - 
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; - fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_tryflush(trans); bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) @@ -730,18 +750,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_pos, &bp, - BTREE_ITER_CACHED); + k = bch2_btree_iter_peek(&bp_iter); + ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bkey_eq(bp_pos, POS_MAX)) + + if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) break; - if (!bp.level) { - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + if (k.k->type != KEY_TYPE_backpointer) + goto next; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (!bp.v->level) { + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -753,7 +778,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; @@ -763,14 +788,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) { - data_opts.rewrite_ptrs |= 1U << i; - if (ptr->cached) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + if (p.ptr.dev == bucket.inode) { + if (p.ptr.cached) { bch2_trans_iter_exit(trans, &iter); goto next; } + data_opts.rewrite_ptrs |= 1U << i; + break; } i++; } @@ -790,14 +819,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -805,7 +835,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; - unsigned sectors = btree_ptr_sectors_written(&b->key); + unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -821,14 +851,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } + sectors_moved += btree_sectors(c); } next: - bp_pos = bpos_nosnap_successor(bp_pos); + bch2_btree_iter_advance(&bp_iter); } - trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); + trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); return ret; } @@ -868,7 +902,7 @@ static int 
bch2_move_btree(struct bch_fs *c, continue; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), @@ -920,7 +954,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - if (!nr_good || nr_good >= replicas) + rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ptr->cached && + (!ca || !ca->mi.durability)) + data_opts->kill_ptrs |= BIT(i); + i++; + } + rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) return false; data_opts->target = 0; @@ -968,27 +1015,17 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg, return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +/* + * Ancient versions of bcachefs produced packed formats which could represent + * keys that the in memory format cannot represent; this checks for those + * formats so we can get rid of them. + */ static bool bformat_needs_redo(struct bkey_format *f) { - unsigned i; - - for (i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f->bits_per_field[i] > unpacked_bits) - return true; - - if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + for (unsigned i = 0; i < f->nr_fields; i++) + if (bch2_bkey_format_field_overflows(f, i)) return true; - if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & - unpacked_mask) < - field_offset) - return true; - } - return false; } @@ -1043,6 +1080,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1053,6 +1091,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } + rcu_read_unlock(); return data_opts->kill_ptrs != 0; } @@ -1137,23 +1176,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "keys moved: "); - prt_u64(out, atomic64_read(&stats->keys_moved)); - prt_newline(out); - - prt_str(out, "keys raced: "); - prt_u64(out, atomic64_read(&stats->keys_raced)); - prt_newline(out); - - prt_str(out, "bytes seen: "); + prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_str(out, "bytes moved: "); + prt_printf(out, "bytes moved: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_str(out, "bytes raced: "); + prt_printf(out, "bytes raced: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1167,19 +1200,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); - prt_printf(out, "reads: ios %u/%u sectors %u/%u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->read_ios), c->opts.move_ios_in_flight, 
atomic_read(&ctxt->read_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); - prt_printf(out, "writes: ios %u/%u sectors %u/%u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->write_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->write_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9baf3093a678..51e0505a8156 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bkey_s_c); -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, + struct btree_iter *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 69e06a84dad4..6718dc37c5a3 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -35,9 +35,10 @@ struct buckets_in_flight { }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), - .key_len = sizeof(struct move_bucket_key), + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), + .automatic_shrinking = true, }; static struct move_bucket_in_flight * @@ -72,31 +73,36 @@ move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) static int bch2_bucket_is_movable(struct btree_trans *trans, struct move_bucket *b, u64 time) { - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a; - int ret; + struct bch_fs *c = trans->c; - if (bch2_bucket_is_open(trans->c, - b->k.bucket.inode, - b->k.bucket.offset)) + if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) return 0; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_CACHED); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_cached); + int ret = bkey_err(k); if (ret) return ret; - a = bch2_alloc_to_v4(k, &_a); + struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode); + if (!ca) + goto out; + + if (ca->mi.state != BCH_MEMBER_STATE_rw || + !bch2_dev_is_online(ca)) + goto out_put; + + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; b->sectors = bch2_bucket_sectors_dirty(*a); + u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - ret = data_type_movable(a->data_type) && - a->fragmentation_lru && - a->fragmentation_lru <= time; - + ret = lru_idx && lru_idx <= time; +out_put: + bch2_dev_put(ca); +out: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -155,11 +161,12 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_err_matches(ret, EROFS)) return ret; - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", - __func__, bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; - ret = 
for_each_btree_key_upto(trans, iter, BTREE_ID_lru, + bch2_trans_begin(trans); + + ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ @@ -207,7 +214,8 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); @@ -237,7 +245,6 @@ static int bch2_copygc(struct moving_context *ctxt, *did_work = true; } err: - darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) @@ -246,8 +253,11 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "from bch2_move_data()"); - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; - trace_and_count(c, copygc, c, moved, 0, 0, 0); + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + + darray_exit(&buckets); return ret; } @@ -288,18 +298,23 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "Currently waiting for: "); + printbuf_tabstop_push(out, 32); + prt_printf(out, "running:\t%u\n", c->copygc_running); + prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); + prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); + + prt_printf(out, "Currently waiting for:\t"); prt_human_readable_u64(out, max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); prt_newline(out); - prt_printf(out, "Currently waiting since: "); + prt_printf(out, "Currently waiting since:\t"); prt_human_readable_u64(out, max(0LL, atomic64_read(&c->io_clock[WRITE].now) - c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait: "); + prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); } @@ -337,9 +352,9 @@ static int bch2_copygc_thread(void *arg) bch2_trans_unlock_long(ctxt.trans); cond_resched(); - if (!c->copy_gc_enabled) { + if (!c->opts.copygc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled || + kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } @@ -376,7 +391,7 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - bch2_trans_unlock_long(ctxt.trans); + move_buckets_wait(&ctxt, buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index b1ed0b9a20d3..6772faf385a5 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/fs_parser.h> #include "bcachefs.h" #include "compress.h" #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -42,12 +44,12 @@ const char * const __bch2_btree_ids[] = { NULL }; -const char * const bch2_csum_types[] = { +static const char * const 
__bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; -const char * const bch2_csum_opts[] = { +const char * const __bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; @@ -62,7 +64,7 @@ const char * const bch2_compression_opts[] = { NULL }; -const char * const bch2_str_hash_types[] = { +const char * const __bch2_str_hash_types[] = { BCH_STR_HASH_TYPES() NULL }; @@ -82,18 +84,41 @@ const char * const bch2_member_states[] = { NULL }; -const char * const bch2_jset_entry_types[] = { +static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; -const char * const bch2_fs_usage_types[] = { +static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x +static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], + unsigned nr, const char *type, unsigned idx) +{ + if (idx < nr) + prt_str(out, opts[idx]); + else + prt_printf(out, "(unknown %s %u)", type, idx); +} + +#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ +void bch2_prt_##name(struct printbuf *out, type t) \ +{ \ + prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ +} + +PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); +PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); +PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); +PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); +PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); +PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); + static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { @@ -203,7 +228,12 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices), \ + .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ + .choices = _choices +#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = U64_MAX, \ + .choices = _choices +#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn @@ -305,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: if (val) { - ret = kstrtou64(val, 10, res); + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; + } else { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } } else { - ret = 0; *res = 1; } - if (ret < 0 || (*res != 0 && *res != 1)) { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } break; case BCH_OPT_UINT: if (!val) { @@ -351,8 +382,19 @@ int bch2_opt_parse(struct bch_fs *c, *res = ret; break; + case BCH_OPT_BITFIELD: { + s64 v = bch2_read_flag_list(val, opt->choices); + if (v < 0) + return v; + *res = v; + break; + } case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); + + if (ret == -BCH_ERR_option_needs_open_fs) + return ret; + if (ret < 0) { if (err) prt_printf(err, "%s: parse error", @@ -389,11 +431,16 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (flags & OPT_SHOW_FULL_LIST) + if (v < opt->min || v >= opt->max) + prt_printf(out, "(invalid option %lli)", v); + else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); break; 
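
The bounds check added to the BCH_OPT_STR case above, together with prt_str_opt_boundscheck()/PRT_STR_OPT_BOUNDSCHECKED earlier in this hunk, follows one pattern: never index a NULL-terminated name table with an unvalidated enum value read from disk. A minimal standalone sketch of that pattern, using plain printf and an invented csum_type_names table rather than the bcachefs printbuf code:

/*
 * Standalone sketch of the bounds-checked name-table lookup; the table,
 * function names and printf usage are illustrative assumptions, not the
 * bcachefs implementation.
 */
#include <stdio.h>
#include <stddef.h>

static const char * const csum_type_names[] = {
        "none", "crc32c", "crc64", NULL,        /* NULL sentinel, like the bch2_*_types[] tables */
};

static void print_enum_name(const char * const names[], size_t nr,
                            const char *what, unsigned idx)
{
        if (idx < nr)
                printf("%s", names[idx]);
        else
                printf("(unknown %s %u)", what, idx);   /* untrusted value: report it, don't dereference */
}

int main(void)
{
        /* nr excludes the NULL sentinel, matching ARRAY_SIZE(...) - 1 above */
        print_enum_name(csum_type_names, 3, "csum type", 2);   /* -> "crc64" */
        putchar('\n');
        print_enum_name(csum_type_names, 3, "csum type", 7);   /* -> "(unknown csum type 7)" */
        putchar('\n');
        return 0;
}
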
+ case BCH_OPT_BITFIELD: + prt_bitflags(out, opt->choices, v); + break; case BCH_OPT_FN: opt->fn.to_text(out, c, sb, v); break; @@ -402,6 +449,32 @@ void bch2_opt_to_text(struct printbuf *out, } } +void bch2_opts_to_text(struct printbuf *out, + struct bch_opts opts, + struct bch_fs *c, struct bch_sb *sb, + unsigned show_mask, unsigned hide_mask, + unsigned flags) +{ + bool first = true; + + for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + + if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) + continue; + + u64 v = bch2_opt_get_by_id(&opts, i); + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + prt_char(out, ','); + first = false; + + bch2_opt_to_text(out, c, sb, opt, v, flags); + } +} + int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) { int ret = 0; @@ -435,14 +508,81 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, + struct printbuf *parse_later, + const char *name, const char *val) +{ + struct printbuf err = PRINTBUF; + u64 v; + int ret, id; + + id = bch2_mount_opt_lookup(name); + + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; + } + + /* Unknown options are ignored: */ + if (id < 0) + return 0; + + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + goto out; + } + + if (ret < 0) + goto bad_val; + + if (opts) + bch2_opt_set_by_id(opts, id, v); + + ret = 0; + goto out; + +bad_opt: + pr_err("Bad mount option %s", name); + ret = -BCH_ERR_option_name; + goto out; + +bad_val: + pr_err("Invalid mount option %s", err.buf); + ret = -BCH_ERR_option_value; + +out: + printbuf_exit(&err); + return ret; +} + int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - char *options) + struct printbuf *parse_later, char *options) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret, id; - struct printbuf err = PRINTBUF; - u64 v; + int ret; if (!options) return 0; @@ -456,60 +596,26 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) - return -1; + return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { - name = strsep(&opt, "="); - val = opt; - - id = bch2_mount_opt_lookup(name); - - /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) + if (!*opt) continue; - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; + name = strsep(&opt, "="); + val = opt; - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); 
+ ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); if (ret < 0) - goto bad_val; - - bch2_opt_set_by_id(opts, id, v); + goto out; } ret = 0; goto out; -bad_opt: - pr_err("Bad mount option %s", name); - ret = -1; - goto out; -bad_val: - pr_err("Invalid mount option %s", err.buf); - ret = -1; - goto out; out: kfree(copied_opts_start); - printbuf_exit(&err); return ret; } @@ -549,10 +655,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +struct bch_dev_sb_opt_set { + void (*set_sb)(struct bch_member *, u64); +}; + +static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { +#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, + BCH_DEV_OPT_SETTERS() +#undef x +}; + +void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; + enum bch_opt_id id = opt - bch2_opt_table; if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -560,16 +676,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) if (opt->flags & OPT_SB_FIELD_ILOG2) v = ilog2(v); - opt->set_sb(sb, v); + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + v++; + + if (opt->flags & OPT_FS) { + if (opt->set_sb != SET_BCH2_NO_SB_OPT) + opt->set_sb(sb, v); + } + + if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return; + + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + + const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; + if (set->set_sb) + set->set_sb(m, v); + else + pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + } } -void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; - mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); bch2_write_super(c); mutex_unlock(&c->sb_lock); } @@ -578,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - return (struct bch_io_opts) { + struct bch_io_opts opts = { #define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x }; + + bch2_io_opts_fixups(&opts); + return opts; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a4b7faa3765..9d397fc2a1f0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,18 +16,23 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_types[]; -extern const char * const bch2_csum_opts[]; +extern const char * const __bch2_csum_opts[]; extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; -extern const char * const bch2_str_hash_types[]; +extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; -extern const char * const bch2_jset_entry_types[]; -extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); +void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); +void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); +void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); +void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); +void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); + static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? 
bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; @@ -51,22 +56,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { - OPT_FS = (1 << 0), /* Filesystem option */ - OPT_DEVICE = (1 << 1), /* Device option */ - OPT_INODE = (1 << 2), /* Inode option */ - OPT_FORMAT = (1 << 3), /* May be specified at format time */ - OPT_MOUNT = (1 << 4), /* May be specified at mount time */ - OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = (1 << 6), - OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ + OPT_FS = BIT(0), /* Filesystem option */ + OPT_DEVICE = BIT(1), /* Device option */ + OPT_INODE = BIT(2), /* Inode option */ + OPT_FORMAT = BIT(3), /* May be specified at format time */ + OPT_MOUNT = BIT(4), /* May be specified at mount time */ + OPT_RUNTIME = BIT(5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = BIT(6), + OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ + OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ + OPT_HIDDEN = BIT(11), }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, BCH_OPT_STR, + BCH_OPT_BITFIELD, BCH_OPT_FN, }; @@ -135,7 +143,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -165,12 +173,12 @@ enum fsck_err_opts { "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ @@ -214,14 +222,14 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, true, \ + x(shard_inode_numbers_bits, u8, \ + OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ @@ -260,6 +268,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ + x(promote_whole_extents, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ + NULL, "Promote whole extents, instead of just part being read")\ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ @@ -290,6 +303,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + 
BCH2_NO_SB_OPT, false, \ + NULL, "Don't kick drives out when splitbrain detected")\ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ @@ -332,6 +350,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ @@ -352,12 +375,27 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't replay the journal") \ - x(keep_journal, u8, \ + NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_passes, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to run explicitly") \ + x(recovery_passes_exclude, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to exclude") \ + x(recovery_pass_last, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR_NOLIMIT(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Exit recovery after specified pass") \ + x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys after startup")\ + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ @@ -373,6 +411,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ + x(allocator_stuck_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U16_MAX), \ + BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ + NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -389,7 +432,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ @@ -409,11 +452,6 @@ enum fsck_err_opts { BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ - x(buckets_nouse, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allocate the buckets_nouse bitmap") \ x(stdio, u64, \ 0, \ OPT_UINT(0, S64_MAX), \ @@ -437,6 +475,18 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ + x(copygc_enabled, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable copygc: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ + x(rebalance_enabled, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ @@ -452,20 +502,30 @@ enum fsck_err_opts { OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ + "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ - OPT_DEVICE, \ + OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be 
considered\n"\ "to have already been replicated n times") \ + x(data_allowed, u8, \ + OPT_DEVICE, \ + OPT_BITFIELD(__bch2_data_types), \ + BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + "types", "Allowed data types for this device: journal, btree, and/or user")\ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") +#define BCH_DEV_OPT_SETTERS() \ + x(discard, BCH_MEMBER_DISCARD) \ + x(durability, BCH_MEMBER_DURABILITY) \ + x(data_allowed, BCH_MEMBER_DATA_ALLOWED) + struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() @@ -476,6 +536,13 @@ struct bch_opts { #undef x }; +struct bch2_opts_parse { + struct bch_opts opts; + + /* to save opts that can't be parsed before the FS is opened: */ + struct printbuf parse_later; +}; + static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ @@ -538,8 +605,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); -void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); +void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); + +struct bch_dev; +void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -551,10 +620,17 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); +void bch2_opts_to_text(struct printbuf *, + struct bch_opts, + struct bch_fs *, struct bch_sb *, + unsigned, unsigned, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); +int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, + struct printbuf *, const char *, const char *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, + char *); /* inode opts: */ @@ -562,11 +638,22 @@ struct bch_io_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x +#define x(_name, _bits) u64 _name##_from_inode:1; + BCH_INODE_OPTS() +#undef x }; -static inline unsigned background_compression(struct bch_io_opts opts) +static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) { - return opts.background_compression ?: opts.compression; + if (!opts->background_target) + opts->background_target = opts->foreground_target; + if (!opts->background_compression) + opts->background_compression = opts->compression; + if (opts->nocow) { + opts->compression = opts->background_compression = 0; + opts->data_checksum = 0; + opts->erasure_code = 0; + } } struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index b27d22925929..4cf5a2af1e6f 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -10,35 +10,57 @@ #include "printbuf.h" -static inline unsigned printbuf_linelen(struct printbuf *buf) 
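
The bch2_io_opts_fixups() helper added to opts.h above centralizes the per-inode option defaults: background_target and background_compression fall back to their foreground counterparts, and nocow switches off the features that depend on copy-on-write updates. A standalone sketch of that behaviour, using a toy struct and values that are assumptions for the example rather than the real bch_io_opts layout:

/*
 * Standalone sketch of the io-option fixup rules; struct fields and values
 * are illustrative, not the bcachefs definitions.
 */
#include <stdio.h>
#include <stdbool.h>

struct io_opts {
        unsigned foreground_target, background_target;
        unsigned compression, background_compression;
        unsigned data_checksum, erasure_code;
        bool nocow;
};

static void io_opts_fixups(struct io_opts *o)
{
        if (!o->background_target)
                o->background_target = o->foreground_target;
        if (!o->background_compression)
                o->background_compression = o->compression;
        if (o->nocow) {
                /* nocow extents are updated in place, so these can't be used: */
                o->compression = o->background_compression = 0;
                o->data_checksum = 0;
                o->erasure_code = 0;
        }
}

int main(void)
{
        struct io_opts o = { .foreground_target = 1, .compression = 2, .nocow = true };

        io_opts_fixups(&o);
        printf("background_target=%u compression=%u checksum=%u\n",
               o.background_target, o.compression, o.data_checksum);
        /* prints: background_target=1 compression=0 checksum=0 */
        return 0;
}
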
+static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) { - return buf->pos - buf->last_newline; + return pos - buf->last_newline; } -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +static inline unsigned printbuf_linelen(struct printbuf *buf) { - unsigned new_size; - char *buf; + return __printbuf_linelen(buf, buf->pos); +} - if (!out->heap_allocated) - return 0; +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ /* Reserved space for terminating nul: */ extra += 1; - if (out->pos + extra < out->size) + if (out->pos + extra <= out->size) + return 0; + + if (!out->heap_allocated) { + out->overflow = true; return 0; + } + + unsigned new_size = roundup_pow_of_two(out->size + extra); - new_size = roundup_pow_of_two(out->size + extra); + /* Sanity check... */ + if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { + out->allocation_failure = true; + out->overflow = true; + return -ENOMEM; + } /* * Note: output buffer must be freeable with kfree(), it's not required * that the user use printbuf_exit(). */ - buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); if (!buf) { out->allocation_failure = true; + out->overflow = true; return -ENOMEM; } @@ -47,6 +69,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) return 0; } +static void printbuf_advance_pos(struct printbuf *out, unsigned len) +{ + out->pos += min(len, printbuf_remaining(out)); +} + +static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) +{ + unsigned move = out->pos - pos; + + bch2_printbuf_make_room(out, nr); + + if (pos + nr < out->size) + memmove(out->buf + pos + nr, + out->buf + pos, + min(move, out->size - 1 - pos - nr)); + + if (pos < out->size) + memset(out->buf + pos, ' ', min(nr, out->size - pos)); + + printbuf_advance_pos(out, nr); + printbuf_nul_terminate_reserved(out); +} + +static void __printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + while (true) { + int pad; + unsigned len = out->pos - pos; + char *p = out->buf + pos; + char *n = memscan(p, '\n', len); + if (cur_tabstop(out)) { + n = min(n, (char *) memscan(p, '\r', len)); + n = min(n, (char *) memscan(p, '\t', len)); + } + + pos = n - out->buf; + if (pos == out->pos) + break; + + switch (*n) { + case '\n': + pos++; + out->last_newline = pos; + + printbuf_insert_spaces(out, pos, out->indent); + + pos = min(pos + out->indent, out->pos); + out->last_field = pos; + out->cur_tabstop = 0; + break; + case '\r': + memmove(n, n + 1, out->pos - pos); + --out->pos; + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); + if (pad > 0) { + printbuf_insert_spaces(out, out->last_field, pad); + pos += pad; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + case '\t': + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; + if (pad > 0) { + *n = ' '; + printbuf_insert_spaces(out, pos, pad - 1); + pos += pad; + } else { + memmove(n, n + 1, out->pos - pos); + --out->pos; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + } + } +} + +static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) + 
__printbuf_do_indent(out, pos); +} + void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { int len; @@ -55,14 +163,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_list args2; va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); va_end(args2); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) @@ -72,14 +180,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) do { va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); va_end(args); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } /** @@ -194,31 +302,32 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) void bch2_prt_newline(struct printbuf *buf) { - unsigned i; - bch2_printbuf_make_room(buf, 1 + buf->indent); - __prt_char(buf, '\n'); + __prt_char_reserved(buf, '\n'); buf->last_newline = buf->pos; - for (i = 0; i < buf->indent; i++) - __prt_char(buf, ' '); + __prt_chars_reserved(buf, ' ', buf->indent); - printbuf_nul_terminate(buf); + printbuf_nul_terminate_reserved(buf); buf->last_field = buf->pos; buf->cur_tabstop = 0; } -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) +void bch2_printbuf_strip_trailing_newline(struct printbuf *out) { - return buf->cur_tabstop < buf->nr_tabstops - ? 
buf->_tabstops[buf->cur_tabstop] - : 0; + for (int p = out->pos - 1; p >= 0; --p) { + if (out->buf[p] == '\n') { + out->pos = p; + break; + } + if (out->buf[p] != ' ') + break; + } + + printbuf_nul_terminate_reserved(out); } static void __prt_tab(struct printbuf *out) @@ -247,24 +356,9 @@ void bch2_prt_tab(struct printbuf *out) static void __prt_tab_rjust(struct printbuf *buf) { - unsigned move = buf->pos - buf->last_field; int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - - if (pad > 0) { - bch2_printbuf_make_room(buf, pad); - - if (buf->last_field + pad < buf->size) - memmove(buf->buf + buf->last_field + pad, - buf->buf + buf->last_field, - min(move, buf->size - 1 - buf->last_field - pad)); - - if (buf->last_field < buf->size) - memset(buf->buf + buf->last_field, ' ', - min((unsigned) pad, buf->size - buf->last_field)); - - buf->pos += pad; - printbuf_nul_terminate(buf); - } + if (pad > 0) + printbuf_insert_spaces(buf, buf->last_field, pad); buf->last_field = buf->pos; buf->cur_tabstop++; @@ -301,41 +395,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf) */ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { - const char *unprinted_start = str; - const char *end = str + count; - - if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { - prt_bytes(out, str, count); - return; - } - - while (str != end) { - switch (*str) { - case '\n': - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - bch2_prt_newline(out); - break; - case '\t': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab(out); - } - break; - case '\r': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab_rjust(out); - } - break; - } - - str++; - } - - prt_bytes(out, unprinted_start, str - unprinted_start); + unsigned indent_pos = out->pos; + prt_bytes(out, str, count); + printbuf_do_indent(out, indent_pos); } /** @@ -348,9 +410,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { bch2_printbuf_make_room(out, 10); - out->pos += string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); + unsigned len = string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); + printbuf_advance_pos(out, len); } /** @@ -402,9 +465,7 @@ void bch2_prt_string_option(struct printbuf *out, const char * const list[], size_t selected) { - size_t i; - - for (i = 0; list[i]; i++) + for (size_t i = 0; list[i]; i++) bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); } diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9a4a56c40937..d0dd398baa2b 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -86,6 +86,7 @@ struct printbuf { u8 atomic; bool allocation_failure:1; bool heap_allocated:1; + bool overflow:1; enum printbuf_si si_units:1; bool human_readable_units:1; bool has_indent_or_tabstops:1; @@ -114,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); +void bch2_printbuf_strip_trailing_newline(struct printbuf *); void bch2_prt_tab(struct printbuf *); void bch2_prt_tab_rjust(struct printbuf *); @@ -142,7 +144,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], */ static inline unsigned printbuf_remaining_size(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos : 0; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + return out->size - out->pos; } /* @@ -151,7 +155,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out) */ static inline unsigned printbuf_remaining(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos - 1 : 0; + return out->size ? printbuf_remaining_size(out) - 1 : 0; } static inline unsigned printbuf_written(struct printbuf *out) @@ -159,30 +163,25 @@ static inline unsigned printbuf_written(struct printbuf *out) return out->size ? min(out->pos, out->size - 1) : 0; } -/* - * Returns true if output was truncated: - */ -static inline bool printbuf_overflowed(struct printbuf *out) +static inline void printbuf_nul_terminate_reserved(struct printbuf *out) { - return out->pos >= out->size; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + if (out->size) + out->buf[out->pos] = 0; } static inline void printbuf_nul_terminate(struct printbuf *out) { bch2_printbuf_make_room(out, 1); - - if (out->pos < out->size) - out->buf[out->pos] = 0; - else if (out->size) - out->buf[out->size - 1] = 0; + printbuf_nul_terminate_reserved(out); } /* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) - out->buf[out->pos] = c; - out->pos++; + out->buf[out->pos++] = c; } /* Doesn't nul terminate: */ @@ -194,37 +193,34 @@ static inline void __prt_char(struct printbuf *out, char c) static inline void prt_char(struct printbuf *out, char c) { - __prt_char(out, c); - printbuf_nul_terminate(out); + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, c); + printbuf_nul_terminate_reserved(out); } static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) { - unsigned i, can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = c; - out->pos += n - can_print; } static inline void prt_chars(struct printbuf *out, char c, unsigned n) { bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { - unsigned i, can_print; - bch2_printbuf_make_room(out, n); - can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < 
can_print; i++) out->buf[out->pos++] = ((char *) b)[i]; - out->pos += n - can_print; printbuf_nul_terminate(out); } @@ -241,18 +237,28 @@ static inline void prt_str_indented(struct printbuf *out, const char *str) static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); +} + +static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->last_newline = 0; + buf->last_field = 0; + buf->indent = 0; + buf->cur_tabstop = 0; } /** @@ -260,11 +266,8 @@ static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) */ static inline void printbuf_reset(struct printbuf *buf) { - buf->pos = 0; - buf->allocation_failure = 0; - buf->indent = 0; + printbuf_reset_keep_tabstops(buf); buf->nr_tabstops = 0; - buf->cur_tabstop = 0; } /** diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index e68b34eab90a..8b857fc33244 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = { }; static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); @@ -59,14 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, - quota_type_invalid, + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, + c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: @@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "i_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", i->i_fieldmask); - prt_newline(out); - - prt_str(out, "i_flags"); - prt_tab(out); - prt_printf(out, "%u", i->i_flags); - prt_newline(out); - - prt_str(out, "i_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_ino_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_timelimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_warnlimit); - prt_newline(out); - - prt_str(out, "i_ino_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_warnlimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_warnlimit); - prt_newline(out); + prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); + prt_printf(out, "i_flags\t%u\n", i->i_flags); + prt_printf(out, 
"i_spc_timelimit\t%u\n", i->i_spc_timelimit); + prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); + prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); + prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); + prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); + prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); } static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) @@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "d_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", q->d_fieldmask); - prt_newline(out); - - prt_str(out, "d_spc_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_hardlimit); - prt_newline(out); - - prt_str(out, "d_spc_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_softlimit); - prt_newline(out); - - prt_str(out, "d_ino_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_hardlimit); - prt_newline(out); - - prt_str(out, "d_ino_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_softlimit); - prt_newline(out); - - prt_str(out, "d_space"); - prt_tab(out); - prt_printf(out, "%llu", q->d_space); - prt_newline(out); - - prt_str(out, "d_ino_count"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_count); - prt_newline(out); - - prt_str(out, "d_ino_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_timer); - prt_newline(out); - - prt_str(out, "d_spc_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_timer); - prt_newline(out); - - prt_str(out, "d_ino_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_ino_warns); - prt_newline(out); - - prt_str(out, "d_spc_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_spc_warns); - prt_newline(out); + prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); + prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); + prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); + prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit); + prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); + prt_printf(out, "d_space\t%llu\n", q->d_space); + prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); + prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); + prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); + prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); + prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); } static inline unsigned __next_qtype(unsigned i, unsigned qtypes) @@ -560,13 +485,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct bch_snapshot_tree s_t; - int ret; + u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot); - ret = bch2_snapshot_tree_lookup(trans, - bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, - snapshot_t(c, k.k->p.snapshot)->tree); + "%s: snapshot tree %u not found", __func__, tree); if (ret) return ret; @@ -612,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + 
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, bch2_fs_quota_read_inode(trans, &iter, k))); bch_err_fn(c, ret); return ret; @@ -902,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -946,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 884f601f41c4..1551800ff44c 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,15 +5,14 @@ #include "inode.h" #include "quota_types.h" -enum bkey_invalid_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_invalid = bch2_quota_invalid, \ + .key_validate = bch2_quota_validate, \ .val_to_text = bch2_quota_to_text, \ .min_val_size = 32, \ }) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c new file mode 100644 index 000000000000..bef2aa1b8bcd --- /dev/null +++ b/fs/bcachefs/rcu_pending.c @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ + +#include <linux/generic-radix-tree.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/srcu.h> +#include <linux/vmalloc.h> + +#include "rcu_pending.h" +#include "darray.h" +#include "util.h" + +#define static_array_for_each(_a, _i) \ + for (typeof(&(_a)[0]) _i = _a; \ + _i < (_a) + ARRAY_SIZE(_a); \ + _i++) + +enum rcu_pending_special { + RCU_PENDING_KVFREE = 1, + RCU_PENDING_CALL_RCU = 2, +}; + +#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) +#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) + +#ifdef __KERNEL__ +typedef unsigned long rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l == r; +} +#else +typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l.grace_period_id == r.grace_period_id; +} +#endif + +static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? get_state_synchronize_srcu(ssp) + : get_state_synchronize_rcu(); +} + +static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? start_poll_synchronize_srcu(ssp) + : start_poll_synchronize_rcu(); +} + +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) +{ + return ssp + ? poll_state_synchronize_srcu(ssp, cookie) + : poll_state_synchronize_rcu(cookie); +} + +static inline void __rcu_barrier(struct srcu_struct *ssp) +{ + return ssp + ? 
srcu_barrier(ssp) + : rcu_barrier(); +} + +static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func) +{ + if (ssp) + call_srcu(ssp, rhp, func); + else + call_rcu(rhp, func); +} + +struct rcu_pending_seq { + /* + * We're using a radix tree like a vector - we're just pushing elements + * onto the end; we're using a radix tree instead of an actual vector to + * avoid reallocation overhead + */ + GENRADIX(struct rcu_head *) objs; + size_t nr; + struct rcu_head **cursor; + rcu_gp_poll_state_t seq; +}; + +struct rcu_pending_list { + struct rcu_head *head; + struct rcu_head *tail; + rcu_gp_poll_state_t seq; +}; + +struct rcu_pending_pcpu { + struct rcu_pending *parent; + spinlock_t lock; + int cpu; + + /* + * We can't bound the number of unprocessed gp sequence numbers, and we + * can't efficiently merge radix trees for expired grace periods, so we + * need darray/vector: + */ + DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; + + /* Third entry is for expired objects: */ + struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; + + struct rcu_head cb; + bool cb_armed; + struct work_struct work; +}; + +static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) +{ + if (p->objs.nr) + return true; + + static_array_for_each(p->lists, i) + if (i->head) + return true; + + return false; +} + +static void rcu_pending_list_merge(struct rcu_pending_list *l1, + struct rcu_pending_list *l2) +{ +#ifdef __KERNEL__ + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next = l2->head; +#else + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next.next = (void *) l2->head; +#endif + + l1->tail = l2->tail; + l2->head = l2->tail = NULL; +} + +static void rcu_pending_list_add(struct rcu_pending_list *l, + struct rcu_head *n) +{ +#ifdef __KERNEL__ + if (!l->head) + l->head = n; + else + l->tail->next = n; + l->tail = n; + n->next = NULL; +#else + if (!l->head) + l->head = n; + else + l->tail->next.next = (void *) n; + l->tail = n; + n->next.next = NULL; +#endif +} + +static void merge_expired_lists(struct rcu_pending_pcpu *p) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + + for (struct rcu_pending_list *i = p->lists; i < expired; i++) + if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) + rcu_pending_list_merge(expired, i); +} + +#ifndef __KERNEL__ +static inline void kfree_bulk(size_t nr, void ** p) +{ + while (nr--) + kfree(*p); +} + +#define local_irq_save(flags) \ +do { \ + flags = 0; \ +} while (0) +#endif + +static noinline void __process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + struct rcu_pending_seq objs = {}; + struct rcu_head *list = NULL; + + if (p->objs.nr && + __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { + objs = p->objs.data[0]; + darray_remove_item(&p->objs, p->objs.data); + } + + merge_expired_lists(p); + + list = expired->head; + expired->head = expired->tail = NULL; + + spin_unlock_irqrestore(&p->lock, flags); + + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + for (size_t i = 0; i < objs.nr; ) { + size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); + + kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); + i += nr_this_node; + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + 
list = (void *) obj->next.next; +#endif + + /* + * low bit of pointer indicates whether rcu_head needs + * to be freed - kvfree_rcu_mightsleep() + */ + BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); + + void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); + bool free_head = ((unsigned long) obj->func) & 1UL; + + kvfree(ptr); + if (free_head) + kfree(obj); + } + + break; + + case RCU_PENDING_CALL_RCU: + for (size_t i = 0; i < objs.nr; i++) { + struct rcu_head *obj = *genradix_ptr(&objs.objs, i); + obj->func(obj); + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + list = (void *) obj->next.next; +#endif + obj->func(obj); + } + break; + + default: + for (size_t i = 0; i < objs.nr; i++) + pending->process(pending, *genradix_ptr(&objs.objs, i)); + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + list = (void *) obj->next.next; +#endif + pending->process(pending, obj); + } + break; + } +} + +static bool process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + /* + * XXX: we should grab the gp seq once and avoid multiple function + * calls, this is called from __rcu_pending_enqueue() fastpath in + * may_sleep==true mode + */ + if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || + (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || + (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || + p->lists[2].head) { + __process_finished_items(pending, p, flags); + return true; + } + + return false; +} + +static void rcu_pending_work(struct work_struct *work) +{ + struct rcu_pending_pcpu *p = + container_of(work, struct rcu_pending_pcpu, work); + struct rcu_pending *pending = p->parent; + unsigned long flags; + + do { + spin_lock_irqsave(&p->lock, flags); + } while (process_finished_items(pending, p, flags)); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static void rcu_pending_rcu_cb(struct rcu_head *rcu) +{ + struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); + + schedule_work_on(p->cpu, &p->work); + + unsigned long flags; + spin_lock_irqsave(&p->lock, flags); + if (__rcu_pending_has_pending(p)) { + spin_unlock_irqrestore(&p->lock, flags); + __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); + } else { + p->cb_armed = false; + spin_unlock_irqrestore(&p->lock, flags); + } +} + +static __always_inline struct rcu_pending_seq * +get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) +{ + darray_for_each_reverse(p->objs, objs) + if (rcu_gp_poll_cookie_eq(objs->seq, seq)) + return objs; + + if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) + return NULL; + + return &darray_last(p->objs); +} + +static noinline bool +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, + struct rcu_head *head, void *ptr, + unsigned long *flags) +{ + if (ptr) { + if (!head) { + /* + * kvfree_rcu_mightsleep(): we weren't passed an + * rcu_head, but we need one: use the low bit of the + * ponter to free to flag that the head needs to be + * freed as well: + */ + ptr = (void *)(((unsigned long) ptr)|1UL); + head = kmalloc(sizeof(*head), __GFP_NOWARN); + if (!head) { + spin_unlock_irqrestore(&p->lock, *flags); + head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); + /* + * dropped lock, did GFP_KERNEL allocation, + * 
check for gp expiration + */ + if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { + kvfree(--ptr); + kfree(head); + spin_lock_irqsave(&p->lock, *flags); + return false; + } + } + } + + head->func = ptr; + } +again: + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (rcu_gp_poll_cookie_eq(i->seq, seq)) { + rcu_pending_list_add(i, head); + return false; + } + } + + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (!i->head) { + i->seq = seq; + rcu_pending_list_add(i, head); + return true; + } + } + + merge_expired_lists(p); + goto again; +} + +/* + * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via + * pending->process) once grace period elapses. + * + * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall + * back to a linked list. + * + * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a + * process callback + * + * - If @ptr and @head are both not NULL, we're kvfree_rcu() + * + * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() + * + * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process + * expired items. + */ +static __always_inline void +__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + void *ptr, bool may_sleep) +{ + + struct rcu_pending_pcpu *p; + struct rcu_pending_seq *objs; + struct genradix_node *new_node = NULL; + unsigned long flags; + bool start_gp = false; + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); +restart: + if (may_sleep && + unlikely(process_finished_items(pending, p, flags))) + goto check_expired; + + /* + * In kvfree_rcu() mode, the radix tree is only for slab pointers so + * that we can do kfree_bulk() - vmalloc pointers always use the linked + * list: + */ + if (ptr && unlikely(is_vmalloc_addr(ptr))) + goto list_add; + + objs = get_object_radix(p, seq); + if (unlikely(!objs)) + goto list_add; + + if (unlikely(!objs->cursor)) { + /* + * New radix tree nodes must be added under @p->lock because the + * tree root is in a darray that can be resized (typically, + * genradix supports concurrent unlocked allocation of new + * nodes) - hence preallocation and the retry loop: + */ + objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, + objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); + if (unlikely(!objs->cursor)) { + if (may_sleep) { + spin_unlock_irqrestore(&p->lock, flags); + + gfp_t gfp = GFP_KERNEL; + if (!head) + gfp |= __GFP_NOFAIL; + + new_node = genradix_alloc_node(gfp); + if (!new_node) + may_sleep = false; + goto check_expired; + } +list_add: + start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); + goto start_gp; + } + } + + *objs->cursor++ = ptr ?: head; + /* zero cursor if we hit the end of a radix tree node: */ + if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) + objs->cursor = NULL; + start_gp = !objs->nr; + objs->nr++; +start_gp: + if (unlikely(start_gp)) { + /* + * We only have one callback (ideally, we would have one for + * every outstanding grace period) - so if our callback is + * already in flight, we may still have to start a grace period + * (since we used get_state() above, not start_poll()) + */ + if (!p->cb_armed) { + p->cb_armed = true; + __call_rcu(pending->srcu, &p->cb, 
rcu_pending_rcu_cb); + } else { + __start_poll_synchronize_rcu(pending->srcu); + } + } + spin_unlock_irqrestore(&p->lock, flags); +free_node: + if (new_node) + genradix_free_node(new_node); + return; +check_expired: + if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + kvfree(ptr); + break; + case RCU_PENDING_CALL_RCU: + head->func(head); + break; + default: + pending->process(pending, head); + break; + } + goto free_node; + } + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + goto restart; +} + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) +{ + __rcu_pending_enqueue(pending, obj, NULL, true); +} + +static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) +{ + struct rcu_head *ret = NULL; + + spin_lock_irq(&p->lock); + darray_for_each(p->objs, objs) + if (objs->nr) { + ret = *genradix_ptr(&objs->objs, --objs->nr); + objs->cursor = NULL; + if (!objs->nr) + genradix_free(&objs->objs); + goto out; + } + + static_array_for_each(p->lists, i) + if (i->head) { + ret = i->head; +#ifdef __KERNEL__ + i->head = ret->next; +#else + i->head = (void *) ret->next.next; +#endif + if (!i->head) + i->tail = NULL; + goto out; + } +out: + spin_unlock_irq(&p->lock); + + return ret; +} + +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) +{ + return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); +} + +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) +{ + struct rcu_head *ret = rcu_pending_dequeue(pending); + + if (ret) + return ret; + + int cpu; + for_each_possible_cpu(cpu) { + ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); + if (ret) + break; + } + return ret; +} + +static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + spin_lock_irq(&p->lock); + if (__rcu_pending_has_pending(p) || p->cb_armed) { + spin_unlock_irq(&p->lock); + return true; + } + spin_unlock_irq(&p->lock); + } + + return false; +} + +void rcu_pending_exit(struct rcu_pending *pending) +{ + int cpu; + + if (!pending->p) + return; + + while (rcu_pending_has_pending_or_armed(pending)) { + __rcu_barrier(pending->srcu); + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + + static_array_for_each(p->lists, i) + WARN_ON(i->head); + WARN_ON(p->objs.nr); + darray_exit(&p->objs); + } + free_percpu(pending->p); +} + +/** + * rcu_pending_init: - initialize a rcu_pending + * + * @pending: Object to init + * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal + * RCU flavor + * @process: Callback function invoked on objects once their RCU barriers + * have completed; if NULL, kvfree() is used. 
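As an aside, a minimal usage sketch of the rcu_pending API documented above; the caller-side names (my_obj, my_obj_process, my_pending, my_subsys_init, my_obj_defer_free, my_subsys_exit) are invented for illustration and error handling is elided:

#include <linux/slab.h>
#include "rcu_pending.h"

struct my_obj {
	struct rcu_head	rcu;
	int		payload;
};

static struct rcu_pending my_pending;

/* Called once a grace period has elapsed for each enqueued object: */
static void my_obj_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct my_obj, rcu));
}

static int my_subsys_init(void)
{
	/* NULL srcu_struct: use plain RCU grace periods */
	return rcu_pending_init(&my_pending, NULL, my_obj_process);
}

static void my_obj_defer_free(struct my_obj *obj)
{
	/* obj is freed by my_obj_process() once the grace period expires: */
	rcu_pending_enqueue(&my_pending, &obj->rcu);
}

static void my_subsys_exit(void)
{
	/* Waits for and processes everything still pending, then frees percpu state: */
	rcu_pending_exit(&my_pending);
}
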
+ */ +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process) +{ + pending->p = alloc_percpu(struct rcu_pending_pcpu); + if (!pending->p) + return -ENOMEM; + + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + p->parent = pending; + p->cpu = cpu; + spin_lock_init(&p->lock); + darray_init(&p->objs); + INIT_WORK(&p->work, rcu_pending_work); + } + + pending->srcu = srcu; + pending->process = process; + + return 0; +} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h new file mode 100644 index 000000000000..71a2f4ddaade --- /dev/null +++ b/fs/bcachefs/rcu_pending.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RCU_PENDING_H +#define _LINUX_RCU_PENDING_H + +#include <linux/rcupdate.h> + +struct rcu_pending; +typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); + +struct rcu_pending_pcpu; + +struct rcu_pending { + struct rcu_pending_pcpu __percpu *p; + struct srcu_struct *srcu; + rcu_pending_process_fn process; +}; + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); + +void rcu_pending_exit(struct rcu_pending *pending); +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process); + +#endif /* _LINUX_RCU_PENDING_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 22d1017aa49b..d0a1f5cd5c2b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -13,6 +13,7 @@ #include "errcode.h" #include "error.h" #include "inode.h" +#include "io_write.h" #include "move.h" #include "rebalance.h" #include "subvolume.h" @@ -23,6 +24,190 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> +/* bch_extent_rebalance: */ + +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static unsigned 
bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (opts->background_target) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + sectors += p.crc.compressed_size; + + return sectors; +} + +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) +{ + if (!bkey_extent_is_direct_data(&_k->k)) + return 0; + + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); + } + + *old = io_opts_to_rebalance_opts(c, opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); + } + + return 0; +} + +int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v + ? 
bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successfull transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) static const char * const bch2_rebalance_state_strs[] = { @@ -32,7 +217,7 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; -static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) { struct btree_iter iter; struct bkey_s_c k; @@ -42,7 +227,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -69,8 +254,9 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, - __bch2_set_rebalance_needs_scan(trans, inum)); + int ret = bch2_trans_commit_do(c, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; } @@ -89,7 +275,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -118,6 +304,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + if (!bch2_bkey_rebalance_opts(k)) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); int ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -131,31 +320,28 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bpos work_pos, struct btree_iter *extent_iter, + struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; - struct bkey_s_c k; bch2_trans_iter_exit(trans, extent_iter); bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek_slot(extent_iter); + BTREE_ITER_all_snapshots); + struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; - const struct bch_extent_rebalance *r = k.k ? 
bch2_bkey_rebalance_opts(k) : NULL; - if (!r) { - /* raced due to btree write buffer, nothing to do */ - return bkey_s_c_null; - } + int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); memset(data_opts, 0, sizeof(*data_opts)); - - data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); - data_opts->target = r->target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { /* @@ -174,12 +360,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (trace_rebalance_extent_enabled()) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->target); - prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->compression); - prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + if (p) { + prt_str(&buf, "compression="); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } + + p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + if (p) { + prt_str(&buf, "move="); + bch2_target_to_text(&buf, c, io_opts->background_target); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } trace_rebalance_extent(c, buf.buf); printbuf_exit(&buf); @@ -208,14 +410,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &data_opts)); + extent_iter, &io_opts, &data_opts)); if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); - if (ret) - goto out; - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); /* @@ -249,20 +447,9 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - unsigned target, compression; - - if (k.k->p.inode) { - target = io_opts->background_target; - compression = background_compression(*io_opts); - } else { - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - target = r ? r->target : io_opts->background_target; - compression = r ? 
r->compression : background_compression(*io_opts); - } - - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); - data_opts->target = target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } @@ -323,17 +510,19 @@ static int do_rebalance(struct moving_context *ctxt) struct bkey_s_c k; int ret = 0; + bch2_trans_begin(trans); + bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); bch2_trans_iter_init(trans, &rebalance_work_iter, BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!r->enabled) { + if (!c->opts.rebalance_enabled) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(r->enabled || + kthread_wait_freezable(c->opts.rebalance_enabled || kthread_should_stop()); } @@ -412,11 +601,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) u64 now = atomic64_read(&c->io_clock[WRITE].now); prt_str(out, "io wait duration: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); prt_str(out, "io wait remaining: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); prt_str(out, "duration waited: "); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 28a52638f16c..62a3859d3823 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -2,8 +2,38 @@ #ifndef _BCACHEFS_REBALANCE_H #define _BCACHEFS_REBALANCE_H +#include "compress.h" +#include "disk_groups.h" +#include "opts.h" #include "rebalance_types.h" +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, + struct bch_io_opts *opts) +{ + struct bch_extent_rebalance r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; +}; + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); +int bch2_get_update_rebalance_opts(struct btree_trans *, + struct bch_io_opts *, + struct btree_iter *, + struct bkey_s_c); + +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h new file mode 100644 index 000000000000..ff9a1342a22b --- /dev/null +++ b/fs/bcachefs/rebalance_format.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_FORMAT_H +#define _BCACHEFS_REBALANCE_FORMAT_H + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + 
erasure_code:1, + data_checksum:4, + data_replicas:4, + background_compression:8, /* enum bch_compression_opt */ + background_target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 background_target:16, + background_compression:8, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, + type:6; +#endif +}; + +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ + x(background_compression) \ + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) + +#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ + diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 0fffb536c1d0..fe5098c17dfc 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,8 +30,6 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; - - unsigned enabled:1; }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 21e13bb4335b..71c786cdb192 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,66 +1,170 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" #include "alloc_background.h" -#include "btree_gc.h" +#include "bkey_buf.h" #include "btree_journal_iter.h" +#include "btree_node_scan.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "buckets.h" #include "dirent.h" -#include "ec.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "fs-common.h" -#include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" -#include "lru.h" #include "logged_ops.h" #include "move.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-downgrade.h" #include "snapshot.h" -#include "subvolume.h" #include "super-io.h" #include <linux/sort.h> #include <linux/stat.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static bool btree_id_is_alloc(enum btree_id id) +int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { - switch (id) { + u64 b = BIT_ULL(btree); + int ret = 0; + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!(c->sb.btrees_lost_data & b)) { + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, btree); + bch_err(c, "flagging btree %s lost data", buf.buf); + printbuf_exit(&buf); + ext->btrees_lost_data |= cpu_to_le64(b); + } + + /* Once we have runtime self healing for topology errors we won't need this: */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + + /* Btree node accounting will be off: */ + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + +#ifdef CONFIG_BCACHEFS_DEBUG + /* + * These are much more minor, and don't need to be corrected right away, + * but in debug mode we want the next fsck run to be clean: + */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = 
bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; +#endif + + switch (btree) { case BTREE_ID_alloc: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + goto out; case BTREE_ID_backpointers: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + goto out; case BTREE_ID_need_discard: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; case BTREE_ID_freespace: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; case BTREE_ID_bucket_gens: - return true; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_lru: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_accounting: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + goto out; default: - return false; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + goto out; } +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +static void kill_btree(struct bch_fs *c, enum btree_id btree) +{ + bch2_btree_id_root(c, btree)->alive = false; + bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* for -o reconstruct_alloc: */ -static void drop_alloc_keys(struct journal_keys *keys) +static void bch2_reconstruct_alloc(struct bch_fs *c) { - size_t src, dst; + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); - for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->d[src].btree_id)) - keys->d[dst++] = keys->d[src]; + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); + + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); + 
__set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - keys->nr = dst; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) + kill_btree(c, i); } /* @@ -70,9 +174,7 @@ static void drop_alloc_keys(struct journal_keys *keys) */ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { - struct journal_key *i; - - for (i = keys->d; i < keys->d + keys->nr; i++) + darray_for_each(*keys, i) if (i->k->k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } @@ -89,14 +191,53 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } +static int bch2_journal_replay_accounting_key(struct btree_trans *trans, + struct journal_key *k) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); + + /* Has this delta already been applied to the btree? 
*/ + if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { + ret = 0; + goto out; + } + + struct bkey_i *new = k->k; + if (old.k->type == KEY_TYPE_accounting) { + new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + bch2_accounting_accumulate(bkey_i_to_accounting(new), + bkey_s_c_to_accounting(old)); + } + + trans->journal_res.seq = k->journal_seq; + + ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_journal_replay_key(struct btree_trans *trans, struct journal_key *k) { struct btree_iter iter; unsigned iter_flags = - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS; - unsigned update_flags = BTREE_TRIGGER_NORUN; + BTREE_ITER_intent| + BTREE_ITER_not_extents; + unsigned update_flags = BTREE_TRIGGER_norun; int ret; if (k->overwritten) @@ -105,17 +246,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, trans->journal_res.seq = k->journal_seq; /* - * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing * besides the allocator is doing updates yet so we don't need key cache * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * btrees use BTREE_ITER_filter_snapshots, which isn't available until * the snapshots recovery pass runs. */ if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED; + iter_flags |= BTREE_ITER_cached; else - update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + update_flags |= BTREE_UPDATE_key_cache_reclaim; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -124,10 +265,26 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (ret) goto out; + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } + /* Must be checked with btree locked: */ if (k->overwritten) goto out; + if (k->k->k.type == KEY_TYPE_accounting) { + ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + goto out; + } + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); @@ -139,17 +296,24 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last + * + * journal_seq == 0 means that the key comes from early repair, and + * should be inserted last so as to avoid overflowing the journal + */ + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } -static int bch2_journal_replay(struct bch_fs *c) +int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; DARRAY(struct journal_key *) keys_sorted = { 0 }; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - struct 
btree_trans *trans = bch2_trans_get(c); + struct btree_trans *trans = NULL; + bool immediate_flush = false; int ret = 0; if (keys->nr) { @@ -161,24 +325,58 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); + move_gap(keys, keys->nr); + trans = bch2_trans_get(c); + + /* + * Replay accounting keys first: we can't allow the write buffer to + * flush accounting keys until we're done + */ + darray_for_each(*keys, k) { + if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) + continue; + + cond_resched(); + + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| + BCH_TRANS_COMMIT_no_journal_res| + BCH_WATERMARK_reclaim, + bch2_journal_replay_accounting_key(trans, k)); + if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) + goto err; + + k->overwritten = true; + } + + set_bit(BCH_FS_accounting_replay_done, &c->flags); + /* * First, attempt to replay keys in sorted order. This is more * efficient - better locality of btree access - but some might fail if * that would cause a journal deadlock. */ - for (size_t i = 0; i < keys->nr; i++) { + darray_for_each(*keys, k) { cond_resched(); - struct journal_key *k = keys->d + i; + /* + * k->allocated means the key wasn't read in from the journal, + * rather it was from early repair code + */ + if (k->allocated) + immediate_flush = true; /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten); + BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); if (ret) { ret = darray_push(&keys_sorted, k); if (ret) @@ -186,6 +384,7 @@ static int bch2_journal_replay(struct bch_fs *c) } } + bch2_trans_unlock_long(trans); /* * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: @@ -199,20 +398,27 @@ static int bch2_journal_replay(struct bch_fs *c) struct journal_key *k = *kp; - replay_now_at(j, k->journal_seq); + if (k->journal_seq) + replay_now_at(j, k->journal_seq); + else + replay_now_at(j, j->replay_journal_seq_end); ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? 
BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(trans, k)); - bch_err_msg(c, ret, "while replaying key at btree %s level %u:", - bch2_btree_id_str(k->btree_id), k->level); - if (ret) + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); + bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); + printbuf_exit(&buf); goto err; + } - BUG_ON(!k->overwritten); + BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } /* @@ -222,7 +428,8 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal) + if (!c->opts.retain_recovery_info && + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -230,6 +437,12 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); + /* if we did any repair, flush it immediately */ + if (immediate_flush) { + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_meta(&c->journal); + } + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: @@ -249,7 +462,15 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->type) { case BCH_JSET_ENTRY_btree_root: { - struct btree_root *r; + + if (unlikely(!entry->u64s)) + return 0; + + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); @@ -257,15 +478,11 @@ static int journal_replay_entry_early(struct bch_fs *c, return ret; } - r = bch2_btree_id_root(c, entry->btree_id); + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - if (entry->u64s) { - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - } else { - r->error = -EIO; - } + r->level = entry->level; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; r->alive = true; break; } @@ -274,42 +491,10 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case BCH_FS_USAGE_reserved: - if (entry->level < BCH_REPLICAS_MAX) - c->usage_base->persistent_reserved[entry->level] = - le64_to_cpu(u->v); - break; - case BCH_FS_USAGE_inodes: - c->usage_base->b.nr_inodes = le64_to_cpu(u->v); - break; case BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, - le64_to_cpu(u->v)); + atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; } - - break; - } - case BCH_JSET_ENTRY_data_usage: { - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - ret = bch2_replicas_set_usage(c, &u->r, - le64_to_cpu(u->v)); - break; - } - case BCH_JSET_ENTRY_dev_usage: { - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } - break; } case BCH_JSET_ENTRY_blacklist: { @@ -337,7 +522,7 @@ static int 
journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } @@ -359,7 +544,7 @@ static int journal_replay_early(struct bch_fs *c, genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; vstruct_for_each(&i->j, entry) { @@ -370,8 +555,6 @@ static int journal_replay_early(struct bch_fs *c, } } - bch2_fs_usage_initialize(c); - return 0; } @@ -379,199 +562,45 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { - unsigned i; + struct printbuf buf = PRINTBUF; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) continue; - if (btree_id_is_alloc(i) && - c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - continue; - } - - if (r->error) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - btree_root_bkey_invalid, - "invalid btree root %s", - bch2_btree_id_str(i)); - if (i == BTREE_ID_alloc) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - } - - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - fsck_err(c, - btree_root_read_error, - "error reading btree root %s", - bch2_btree_id_str(i)); + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, i, r->level); + + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", + buf.buf) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, + "error reading btree root %s: %s", + buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - ret = 0; + r->error = 0; + + ret = bch2_btree_lost_data(c, i); + BUG_ON(ret); } } - for (i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (!r->b) { + if (!r->b && !r->error) { r->alive = false; r->level = 0; - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); } } fsck_err: - return ret; -} - -static int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; 
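A further aside on journal_sort_seq_cmp() in the journal replay hunk above: subtracting 1 relies on u64 wraparound, so journal_seq == 0 (keys injected by early repair rather than read from the journal) maps to U64_MAX and sorts after every real sequence number. A minimal userspace sketch of that trick, using a local stand-in for cmp_int():

#include <assert.h>
#include <stdint.h>

/* Local stand-in for the kernel's cmp_int(): returns -1, 0 or 1 */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

int main(void)
{
	uint64_t repair_seq  = 0;	/* key from early repair */
	uint64_t journal_seq = 12345;	/* key read from the journal */

	/* 0 - 1 wraps to UINT64_MAX, so the repair key compares greater and sorts last: */
	assert(cmp_int(repair_seq - 1, journal_seq - 1) > 0);
	return 0;
}
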
- int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -noinline_for_stack -static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (keys->nr || c->opts.fsck || !c->sb.clean) - return bch2_fs_read_write_early(c); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); + printbuf_exit(&buf); return ret; } @@ -582,6 +611,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; + bool ret = false; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -637,103 +667,31 @@ static bool check_version_upgrade(struct bch_fs *c) } bch_info(c, "%s", buf.buf); - - bch2_sb_upgrade(c, new_version); - printbuf_exit(&buf); - return true; - } - - return false; -} - -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); - } - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + ret = true; } - return ret; -} - -int bch2_run_online_recovery_passes(struct bch_fs *c) -{ - int ret = 0; + if (new_version > c->sb.version_incompat && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { + struct printbuf buf = PRINTBUF; - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); - if (!(p->when & PASS_ONLINE)) - continue; + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - break; + ret = true; } + if (ret) + bch2_sb_upgrade(c, new_version, + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); + return ret; } @@ -769,75 +727,73 @@ int 
bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck && c->opts.norecovery) { - bch_err(c, "cannot select both norecovery and fsck"); - ret = -EINVAL; - goto err; + if (c->opts.norecovery) { + c->opts.recovery_pass_last = c->opts.recovery_pass_last + ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; + c->opts.read_only = true; } - if (!c->opts.nochanges) { - mutex_lock(&c->sb_lock); - bool write_sb = false; - - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); - if (!ext) { - ret = -BCH_ERR_ENOSPC_sb; - mutex_unlock(&c->sb_lock); - goto err; - } + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + bool write_sb = false; - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + write_sb = true; + } - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + struct printbuf buf = PRINTBUF; + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; + if (bch2_check_version_downgrade(c)) { + struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:"); - - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } + prt_str(&buf, "Version downgrade required:"); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&buf, "\n running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - if (check_version_upgrade(c)) - write_sb = true; + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + write_sb = true; + } + + if (check_version_upgrade(c)) + write_sb = true; - if (write_sb) - bch2_write_super(c); + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - mutex_unlock(&c->sb_lock); + if (c->sb.version_upgrade_complete < 
bcachefs_metadata_version_autofix_errors) { + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + write_sb = true; } - if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); + set_bit(BCH_FS_recovery_running, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -845,7 +801,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + bch2_journal_pos_from_member_info_resume(c); + + if (!c->sb.clean || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -862,7 +820,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto out; genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i && !(*i)->ignore) { + if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } @@ -887,7 +845,8 @@ int bch2_fs_recovery(struct bch_fs *c) genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i) { last_journal_entry = &(*i)->j; - (*i)->ignore = false; + (*i)->ignore_blacklisted = false; + (*i)->ignore_not_dirty= false; /* * This was probably a NO_FLUSH entry, * so last_seq was garbage - but we know @@ -923,17 +882,15 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - drop_alloc_keys(&c->journal_keys); - } - zero_out_btree_mem_ptr(&c->journal_keys); ret = journal_replay_early(c, clean); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_reconstruct_alloc(c); + /* * After an unclean shutdown, skip then next few journal sequence * numbers as they may have been referenced by btree writes that @@ -950,7 +907,7 @@ use_clean: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { - bch_err(c, "error creating new journal seq blacklist entry"); + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; } } @@ -961,9 +918,6 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(c, "dropping alloc info"); - /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: @@ -975,11 +929,34 @@ use_clean: if (ret) goto err; + set_bit(BCH_FS_btree_running, &c->flags); + + ret = bch2_sb_set_upgrade_extra(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; + /* + * Normally set by the appropriate recovery pass: when cleared, this + * indicates we're in early recovery and btree updates should be done by + * being applied to the journal replay keys. _Must_ be cleared before + * multithreaded use: + */ + set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_recovery_running, &c->flags); + + /* in case we don't run journal replay, i.e. 
norecovery mode */ + set_bit(BCH_FS_accounting_replay_done, &c->flags); + + bch2_async_btree_node_rewrites_flush(c); + + /* fsync if we fixed errors */ + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && @@ -1015,7 +992,8 @@ use_clean: } mutex_lock(&c->sb_lock); - bool write_sb = false; + ext = bch2_sb_field_get(c->disk_sb.sb, ext); + write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); @@ -1028,15 +1006,18 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_error, &c->flags)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (ext && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } + if (!test_bit(BCH_FS_error, &c->flags) && + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; + } + + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + ext->btrees_lost_data = 0; + write_sb = true; } if (c->opts.fsck && @@ -1047,6 +1028,9 @@ use_clean: write_sb = true; } + if (bch2_blacklist_entries_gc(c)) + write_sb = true; + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1069,18 +1053,16 @@ use_clean: bch_info(c, "scanning for old btree nodes done"); } - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); - ret = 0; out: bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!c->opts.retain_recovery_info) { bch2_journal_keys_put_initial(c); - kfree(clean); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + } + if (!IS_ERR(clean)) + kfree(clean); if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && @@ -1102,9 +1084,11 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); + struct bch_member *m; int ret; bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); @@ -1113,20 +1097,25 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_check_version_downgrade(c); if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current); + bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } + + for_each_member_device(c, ca) { + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); + ca->mi = bch2_mi_to_cpu(m); + } + + bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + 
set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); - - for_each_member_device(c, ca) - bch2_dev_usage_init(ca); + bch2_btree_root_alloc_fake(c, i, 0); ret = bch2_fs_journal_alloc(c); if (ret) @@ -1137,12 +1126,21 @@ int bch2_fs_initialize(struct bch_fs *c) * set up the journal.pin FIFO and journal.cur pointer: */ bch2_fs_journal_start(&c->journal, 1); + set_bit(BCH_FS_accounting_replay_done, &c->flags); bch2_journal_set_replay_done(&c->journal); ret = bch2_fs_read_write_early(c); if (ret) goto err; + for_each_member_device(c, ca) { + ret = bch2_dev_usage_init(ca, false); + if (ret) { + bch2_dev_put(ca); + goto err; + } + } + /* * Write out the superblock and journal buckets, now that we can do * btree updates @@ -1153,9 +1151,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1176,14 +1171,14 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); bch_err_msg(c, ret, "creating root directory"); if (ret) goto err; bch2_inode_init_early(c, &lostfound_inode); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, @@ -1194,7 +1189,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1214,6 +1209,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4e9d24719b2e..b0d55754b21b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,37 +2,9 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -extern const char * const bch2_recovery_passes[]; +int bch2_btree_lost_data(struct bch_fs *, enum btree_id); -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return 0; - - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - -int bch2_run_online_recovery_passes(struct bch_fs *); -u64 bch2_fsck_recovery_passes(void); +int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c new file mode 100644 index 000000000000..0b3c951c32da --- /dev/null +++ 
b/fs/bcachefs/recovery_passes.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_gc.h" +#include "btree_node_scan.h" +#include "disk_accounting.h" +#include "ec.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "lru.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "recovery.h" +#include "recovery_passes.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" + +const char * const bch2_recovery_passes[] = { +#define x(_fn, ...) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ +static int bch2_recovery_pass_empty(struct bch_fs *c) +{ + return 0; +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys, keys->nr); + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + return bch2_fs_read_write_early(c); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +static const u8 passes_to_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_to_stable_map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + return -BCH_ERR_not_in_recovery; + + if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); + return -BCH_ERR_cannot_rewind_recovery; + } + + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->opts.recovery_passes |= BIT_ULL(pass); + + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + unsigned long flags; + spin_lock_irqsave(&c->recovery_pass_lock, flags); + int ret = __bch2_run_explicit_recovery_pass(c, pass); + spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + return ret; +} + +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + + return bch2_run_explicit_recovery_pass(c, pass); +} + +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!test_bit_le64(s, ext->recovery_passes_required)) { + __set_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + return bch2_run_explicit_recovery_pass(c, pass); +} + +static void bch2_clear_recovery_pass_required(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (test_bit_le64(s, ext->recovery_passes_required)) { + __clear_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) + return false; + if (c->opts.recovery_passes & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if 
(p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + down_read(&c->state_lock); + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; + continue; + } + if (ret) + break; + } + + up_read(&c->state_lock); + + return ret; +} + +int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; + + spin_lock_irq(&c->recovery_pass_lock); + unsigned pass = c->curr_recovery_pass; + + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) { + spin_unlock_irq(&c->recovery_pass_lock); + break; + } + + if (!should_run_recovery_pass(c, pass)) { + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, pass); + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } + spin_unlock_irq(&c->recovery_pass_lock); + + ret = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + + spin_lock_irq(&c->recovery_pass_lock); + if (c->next_recovery_pass < c->curr_recovery_pass) { + /* + * bch2_run_explicit_recovery_pass() was called: we + * can't always catch -BCH_ERR_restart_recovery because + * it may have been called from another thread (btree + * node read completion) + */ + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); + } else { + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); + } + c->curr_recovery_pass = c->next_recovery_pass; + spin_unlock_irq(&c->recovery_pass_lock); + } + + return ret; +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h new file mode 100644 index 000000000000..7d7339c8fa29 --- /dev/null +++ b/fs/bcachefs/recovery_passes.h @@ -0,0 +1,18 @@ +#ifndef _BCACHEFS_RECOVERY_PASSES_H +#define _BCACHEFS_RECOVERY_PASSES_H + +extern const char * const bch2_recovery_passes[]; + +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *); +int bch2_run_recovery_passes(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_types.h 
b/fs/bcachefs/recovery_passes_types.h new file mode 100644 index 000000000000..418557960ed6 --- /dev/null +++ b/fs/bcachefs/recovery_passes_types.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) + +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + +/* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) + +/* We normally enumerate recovery passes in the order we run them: */ +enum bch_recovery_pass { +#define x(n, id, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + BCH_RECOVERY_PASS_NR +}; + +/* But we also need stable identifiers that can be used in the superblock */ +enum bch_recovery_pass_stable { +#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, + BCH_RECOVERY_PASSES() +#undef x +}; + +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h deleted file mode 100644 index fa0c8efd2a1b..000000000000 --- a/fs/bcachefs/recovery_types.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_TYPES_H -#define _BCACHEFS_RECOVERY_TYPES_H - -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) -#define PASS_ONLINE BIT(4) - -/* - * Passes may be reordered, but the second field is a 
persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_topology, 4, 0) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x -}; - -#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c47c66c2b394..441e648f28b5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -15,6 +15,17 @@ #include <linux/sched/signal.h> +static inline bool bkey_extent_is_reflink_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_reflink_v: + case KEY_TYPE_indirect_inline_data: + return true; + default: + return false; + } +} + static inline unsigned bkey_type_to_indirect(const struct bkey *k) { switch (k->type) { @@ -29,17 +40,16 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), - c, err, reflink_p_front_pad_bad, + bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), + c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); fsck_err: return ret; } @@ -50,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); prt_printf(out, "idx %llu front_pad %u 
back_pad %u", - le64_to_cpu(p.v->idx), + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); } @@ -66,90 +76,291 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r */ return false; - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) + return false; + + if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) return false; bch2_key_resize(l.k, l.k->size + r.k->size); return true; } +/* indirect extents */ + +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, from); +fsck_err: + return ret; +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +#if 0 +Currently disabled, needs to be debugged: + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} +#endif + +/* indirect inline data */ + +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +/* lookup */ + +static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + bool should_commit) +{ + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + SET_REFLINK_P_ERROR(&new->v, false); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + return ret; + + if (!should_commit) + return 0; + + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +} + +static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 missing_start, u64 missing_end, + bool should_commit) +{ + if (REFLINK_P_ERROR(p.v)) + return 0; + + struct bch_fs *c = trans->c; + u64 live_start = REFLINK_P_IDX(p.v); + u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; + u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); + u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(missing_start < refd_start); + BUG_ON(missing_end > refd_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + missing_start, missing_end)) { + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + 
if (ret) + goto err; + + /* + * Is the missing range not actually needed? + * + * p.v->idx refers to the data that we actually want, but if the + * indirect extent we point to was bigger, front_pad and back_pad + * indicate the range we took a reference on. + */ + + if (missing_end <= live_start) { + new->v.front_pad = cpu_to_le32(live_start - missing_end); + } else if (missing_start >= live_end) { + new->v.back_pad = cpu_to_le32(missing_start - live_end); + } else { + struct bpos new_start = bkey_start_pos(&new->k); + struct bpos new_end = new->k.p; + + if (missing_start > live_start) + new_start.offset += missing_start - live_start; + if (missing_end < live_end) + new_end.offset -= live_end - missing_end; + + bch2_cut_front(new_start, &new->k_i); + bch2_cut_back(new_end, &new->k_i); + + SET_REFLINK_P_ERROR(&new->v, true); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + goto err; + + if (should_commit) + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * This is used from the read path, which doesn't expect to have to do a + * transaction commit, and from triggers, which should not be doing a commit: + */ +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, + struct btree_iter *iter, + s64 *offset_into_extent, + struct bkey_s_c_reflink_p p, + bool should_commit, + unsigned iter_flags) +{ + BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); + BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); + + u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; + + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, + POS(0, reflink_offset), iter_flags); + if (bkey_err(k)) + return k; + + if (unlikely(!bkey_extent_is_reflink_data(k.k))) { + unsigned size = min((u64) k.k->size, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - + reflink_offset); + bch2_key_resize(&iter->k, size); + + int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, + k.k->p.offset, should_commit); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_err(ret); + } + } else if (unlikely(REFLINK_P_ERROR(p.v))) { + int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_err(ret); + } + } + + *offset_into_extent = reflink_offset - bkey_start_offset(k.k); + return k; +} + +/* reflink pointer trigger */ + static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; struct printbuf buf = PRINTBUF; - int ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); - ret = PTR_ERR_OR_ZERO(k); + s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - refcount = bkey_refcount(bkey_i_to_s(k)); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + if (!bkey_refcount_c(k)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_missing_indirect_extent; + goto next; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) goto err; + + __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); + if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + log_fsck_err(trans, reflink_refcount_underflow, + "indirect extent refcount underflow while marking\n %s", + buf.buf); + goto next; } - if (flags & BTREE_TRIGGER_INSERT) { + if (flags & BTREE_TRIGGER_insert) { struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } - le64_add_cpu(refcount, add); + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_trans_update(trans, &iter, new, 0); if (ret) goto err; - - *idx = k->k.p.offset; +next: + *idx = k.k->p.offset; err: +fsck_err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags, size_t r_idx) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags, + size_t r_idx) { struct bch_fs *c = trans->c; struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - u64 start = le64_to_cpu(p.v->idx); - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - u64 next_idx = end + le32_to_cpu(p.v->back_pad); + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; + u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -163,60 +374,40 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, BUG_ON((s64) r->refcount + add < 0); - r->refcount += add; + if (flags & BTREE_TRIGGER_gc) + r->refcount += add; *idx = r->offset; return 0; not_found: - if (fsck_err(c, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); - ret = PTR_ERR_OR_ZERO(update); + if (flags & BTREE_TRIGGER_check_repair) { + ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); if (ret) goto err; - - if (next_idx <= start) { - bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); - } else if (*idx >= end) { - bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); - } else { - bkey_error_init(update); - update->k.p = p.k->p; - update->k.p.offset = next_idx; - update->k.size = next_idx - *idx; - set_bkey_val_u64s(&update->k, 0); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); } *idx = next_idx; err: -fsck_err: printbuf_exit(&buf); return ret; } static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { size_t l = 0, r = c->reflink_gc_nr; while (l < r) { @@ -239,10 +430,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) { struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; v->front_pad = v->back_pad = 0; @@ -251,92 +442,48 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); } -/* indirect extents */ +/* indirect extent trigger */ -int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +static inline void +check_indirect_extent_deleting(struct bkey_s new, + enum btree_iter_update_trigger_flags *flags) { - return bch2_bkey_ptrs_invalid(c, k, flags, err); -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - 
bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif - -static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) -{ - if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { new.k->type = KEY_TYPE_deleted; new.k->size = 0; set_bkey_val_u64s(new.k, 0); - *flags &= ~BTREE_TRIGGER_INSERT; + *flags &= ~BTREE_TRIGGER_insert; } } int bch2_trigger_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) check_indirect_extent_deleting(new, &flags); return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } -/* indirect inline data */ - -int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { check_indirect_extent_deleting(new, &flags); return 0; } +/* create */ + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i *orig) + struct bkey_i *orig, + bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; struct btree_iter reflink_iter = { NULL }; @@ -350,12 +497,20 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_prev(&reflink_iter); ret = bkey_err(k); if (ret) goto err; + /* + * XXX: we're assuming that 56 bits will be enough for the life of the + * filesystem: we need to implement wraparound, with a cursor in the + * logged ops btree: + */ + if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) + return -ENOSPC; + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -365,7 +520,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.version = orig->k.version; + r_v->k.bversion = orig->k.bversion; set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); @@ -392,10 +547,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, memset(&r_p->v, 0, sizeof(r_p->v)); #endif - r_p->v.idx = 
cpu_to_le64(bkey_start_offset(&r_v->k)); + SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); + + if (reflink_p_may_update_opts_field) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch2_trans_iter_exit(trans, &reflink_iter); @@ -407,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -424,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta) + u64 new_i_size, s64 *i_sectors_delta, + bool may_change_src_io_path_opts) { struct btree_trans *trans; struct btree_iter dst_iter, src_iter; @@ -437,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; + bool reflink_p_may_update_opts_field = + bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) @@ -456,9 +617,9 @@ s64 bch2_remap_range(struct bch_fs *c, goto err; bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && @@ -518,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, src_k = bkey_i_to_s_c(new_src.k); ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k); + new_src.k, + reflink_p_may_update_opts_field); if (ret) continue; @@ -531,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_i_reflink_p *dst_p = bkey_reflink_p_init(new_dst.k); - u64 offset = le64_to_cpu(src_p.v->idx) + + u64 offset = REFLINK_P_IDX(src_p.v) + (src_want.offset - bkey_start_offset(src_k.k)); - dst_p->v.idx = cpu_to_le64(offset); + SET_REFLINK_P_IDX(&dst_p->v, offset); + + if (reflink_p_may_update_opts_field && + may_change_src_io_path_opts) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); } @@ -545,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, @@ -568,7 +734,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(trans); ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_intent); if (!ret2 && inode_u.bi_size < new_i_size) { @@ -589,3 +755,97 @@ err: return dst_done ?: ret ?: ret2; } + +/* fsck */ + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset 
!= k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_gc_reflink_done(struct bch_fs *c) +{ + size_t idx = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); + c->reflink_gc_nr = 0; + return ret; +} + +int bch2_gc_reflink_start(struct bch_fs *c) +{ + c->reflink_gc_nr = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, ({ + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); + + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4d8867289717..1632780bdf18 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,50 +2,48 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bkey_invalid_flags; - -int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_p_invalid, \ + .key_validate = bch2_reflink_p_validate, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_v_invalid, \ + .key_validate = bch2_reflink_v_validate, \ .val_to_text = 
bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) -int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, - unsigned); + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_invalid = bch2_indirect_inline_data_invalid, \ + .key_validate = bch2_indirect_inline_data_validate, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ @@ -75,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k) } } +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, + s64 *, struct bkey_s_c_reflink_p, + bool, unsigned); + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *, + bool); + +int bch2_gc_reflink_done(struct bch_fs *); +int bch2_gc_reflink_start(struct bch_fs *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 6772eebb1fc6..92995e4f898e 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -4,7 +4,7 @@ struct bch_reflink_p { struct bch_val v; - __le64 idx; + __le64 idx_flags; /* * A reflink pointer might point to an indirect extent which is then * later split (by copygc or rebalance). If we only pointed to part of @@ -17,6 +17,11 @@ struct bch_reflink_p { __le32 back_pad; } __packed __aligned(8); +LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); +LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); +LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, + struct bch_reflink_p, idx_flags, 57, 58); + struct bch_reflink_v { struct bch_val v; __le64 refcount; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cc2672c12031..477ef0997949 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -2,16 +2,20 @@ #include "bcachefs.h" #include "buckets.h" +#include "disk_accounting.h" #include "journal.h" #include "replicas.h" #include "super-io.h" +#include <linux/sort.h> + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) 
compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -20,14 +24,11 @@ static int bch2_memcmp(const void *l, const void *r, size_t size) static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG - unsigned i; - - BUG_ON(e->data_type >= BCH_DATA_NR); BUG_ON(!e->nr_devs); BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); - for (i = 0; i + 1 < e->nr_devs; i++) + for (unsigned i = 0; i + 1 < e->nr_devs; i++) BUG_ON(e->devs[i] >= e->devs[i + 1]); #endif } @@ -39,7 +40,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -64,8 +66,36 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } +static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) +{ + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_member_exists(sb, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; +} + int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, + struct bch_fs *c, struct printbuf *err) { if (!r->nr_devs) { @@ -80,7 +110,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_dev_exists(sb, r->devs[i])) { + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -120,7 +151,7 @@ static void extent_to_replicas(struct bkey_s_c k, continue; if (!p.has_ec) - r->devs[r->nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(r, p.ptr.dev); else r->nr_required = 0; } @@ -137,7 +168,7 @@ static void stripe_to_replicas(struct bkey_s_c k, for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) - r->devs[r->nr_devs++] = ptr->dev; + replicas_entry_add_dev(r, ptr->dev); } void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, @@ -178,7 +209,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, e->nr_required = 1; darray_for_each(devs, i) - e->devs[e->nr_devs++] = *i; + replicas_entry_add_dev(e, *i); bch2_replicas_entry_sort(e); } @@ -188,24 +219,17 @@ cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, struct bch_replicas_entry_v1 *new_entry) { - unsigned i; struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, replicas_entry_bytes(new_entry)), }; - for (i = 0; i < new_entry->nr_devs; i++) - BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); - - BUG_ON(!new_entry->data_type); - verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return 
new; - for (i = 0; i < old->nr; i++) + for (unsigned i = 0; i < old->nr; i++) memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), old->entry_size); @@ -226,9 +250,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, if (unlikely(entry_size > r->entry_size)) return -1; - verify_replicas_entry(search); - -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -250,145 +272,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -bool bch2_replicas_marked(struct bch_fs *c, +bool bch2_replicas_marked_locked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { - bool marked; - - if (!search->nr_devs) - return true; - verify_replicas_entry(search); - percpu_down_read(&c->mark_lock); - marked = __replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); - percpu_up_read(&c->mark_lock); - - return marked; + return !search->nr_devs || + (__replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search))); } -static void __replicas_table_update(struct bch_fs_usage *dst, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage *src, - struct bch_replicas_cpu *src_r) -{ - int src_idx, dst_idx; - - *dst = *src; - - for (src_idx = 0; src_idx < src_r->nr; src_idx++) { - if (!src->replicas[src_idx]) - continue; - - dst_idx = __replicas_entry_idx(dst_r, - cpu_replicas_entry(src_r, src_idx)); - BUG_ON(dst_idx < 0); - - dst->replicas[dst_idx] = src->replicas[src_idx]; - } -} - -static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage __percpu *src_p, - struct bch_replicas_cpu *src_r) -{ - unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; - struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); - - preempt_disable(); - dst = this_cpu_ptr(dst_p); - preempt_enable(); - - __replicas_table_update(dst, dst_r, src, src_r); -} - -/* - * Resize filesystem accounting: - */ -static int replicas_table_update(struct bch_fs *c, - struct bch_replicas_cpu *new_r) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) { - struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; - struct bch_fs_usage_online *new_scratch = NULL; - struct bch_fs_usage __percpu *new_gc = NULL; - struct bch_fs_usage *new_base = NULL; - unsigned i, bytes = sizeof(struct bch_fs_usage) + - sizeof(u64) * new_r->nr; - unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + - sizeof(u64) * new_r->nr; - int ret = 0; - - memset(new_usage, 0, sizeof(new_usage)); - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_KERNEL))) - goto err; + percpu_down_read(&c->mark_lock); + bool ret = bch2_replicas_marked_locked(c, search); + percpu_up_read(&c->mark_lock); - if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || - !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || - (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) - goto err; - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (c->usage[i]) - __replicas_table_update_pcpu(new_usage[i], new_r, - c->usage[i], &c->replicas); - if (c->usage_base) - 
__replicas_table_update(new_base, new_r, - c->usage_base, &c->replicas); - if (c->usage_gc) - __replicas_table_update_pcpu(new_gc, new_r, - c->usage_gc, &c->replicas); - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - swap(c->usage[i], new_usage[i]); - swap(c->usage_base, new_base); - swap(c->usage_scratch, new_scratch); - swap(c->usage_gc, new_gc); - swap(c->replicas, *new_r); -out: - free_percpu(new_gc); - kfree(new_scratch); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - free_percpu(new_usage[i]); - kfree(new_base); return ret; -err: - bch_err(c, "error updating replicas table: memory allocation failure"); - ret = -BCH_ERR_ENOMEM_replicas_table; - goto out; -} - -static unsigned reserve_journal_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - unsigned journal_res_u64s = 0; - - /* nr_inodes: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* key_version: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* persistent_reserved: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * - BCH_REPLICAS_MAX; - - for_each_cpu_replicas_entry(r, e) - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + - e->nr_devs, sizeof(u64)); - return journal_res_u64s; } noinline @@ -424,10 +326,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; - - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &new_r)); } if (!new_r.entries && @@ -442,7 +340,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, /* don't update in memory replicas until changes are persistent */ percpu_down_write(&c->mark_lock); if (new_r.entries) - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); if (new_gc.entries) swap(new_gc, c->replicas_gc); percpu_up_write(&c->mark_lock); @@ -464,20 +362,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) ? 
0 : bch2_mark_replicas_slowpath(c, r); } -/* replicas delta list: */ - -int bch2_replicas_delta_list_mark(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - int ret = 0; - - for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) - ret = bch2_mark_replicas(c, &d->r); - return ret; -} - /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: @@ -491,8 +375,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) percpu_down_write(&c->mark_lock); ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: - replicas_table_update(c, &c->replicas_gc); + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (!ret) + swap(c->replicas, c->replicas_gc); kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -520,13 +405,16 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.nr = 0; c->replicas_gc.entry_size = 0; - for_each_cpu_replicas_entry(&c->replicas, e) - if (!((1 << e->data_type) & typemask)) { + for_each_cpu_replicas_entry(&c->replicas, e) { + /* Preserve unknown data types */ + if (e->data_type >= BCH_DATA_NR || + !((1 << e->data_type) & typemask)) { c->replicas_gc.nr++; c->replicas_gc.entry_size = max_t(unsigned, c->replicas_gc.entry_size, replicas_entry_bytes(e)); } + } c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, @@ -538,7 +426,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) } for_each_cpu_replicas_entry(&c->replicas, e) - if (!((1 << e->data_type) & typemask)) + if (e->data_type >= BCH_DATA_NR || + !((1 << e->data_type) & typemask)) memcpy(cpu_replicas_entry(&c->replicas_gc, i++), e, c->replicas_gc.entry_size); @@ -559,10 +448,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; - unsigned i, nr; + unsigned nr; int ret = 0; - bch2_journal_meta(&c->journal); + bch2_accounting_mem_gc(c); retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); @@ -583,24 +472,34 @@ retry: goto retry; } - for (i = 0; i < c->replicas.nr; i++) { + for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_journal || - c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]) || - percpu_u64_get(&c->usage[2]->replicas[i]) || - percpu_u64_get(&c->usage[3]->replicas[i])) + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), + "embedded variable length struct"); + + struct bpos p = disk_accounting_pos_to_bpos(&k); + + struct bch_accounting_mem *acc = &c->accounting; + bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p) >= acc->k.nr; + + if (e->data_type == BCH_DATA_journal || !kill) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } bch2_cpu_replicas_sort(&new); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: - replicas_table_update(c, &new); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + + if (!ret) + swap(c->replicas, new); kfree(new.entries); @@ -614,34 +513,6 @@ retry: return ret; } -int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry_v1 *r, - u64 sectors) -{ - int ret, idx = 
bch2_replicas_entry_idx(c, r); - - if (idx < 0) { - struct bch_replicas_cpu n; - - n = cpu_replicas_add_entry(c, &c->replicas, r); - if (!n.entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - ret = replicas_table_update(c, &n); - if (ret) - return ret; - - kfree(n.entries); - - idx = bch2_replicas_entry_idx(c, r); - BUG_ON(ret < 0); - } - - c->usage_base->replicas[idx] = sectors; - - return 0; -} - /* Replicas tracking - superblock: */ static int @@ -727,8 +598,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); - - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); percpu_up_write(&c->mark_lock); kfree(new_r.entries); @@ -824,16 +694,17 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate(e, sb, err); + int ret = bch2_replicas_entry_sb_validate(e, sb, err); if (ret) return ret; @@ -855,7 +726,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_replicas_cpu cpu_r; @@ -894,7 +765,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { }; static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_cpu cpu_r; @@ -942,20 +813,27 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; if (e->data_type == BCH_DATA_cached) continue; - for (i = 0; i < e->nr_devs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + rcu_read_lock(); + for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } nr_online += test_bit(e->devs[i], devs.d); - nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } + rcu_read_unlock(); - if (nr_failed == e->nr_devs) + if (nr_online + nr_failed == e->nr_devs) continue; if (nr_online < e->nr_required) @@ -991,7 +869,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; struct bch_sb_field_replicas_v0 *replicas_v0; - unsigned i, data_has = 0; + unsigned data_has = 0; replicas = bch2_sb_field_get(sb, replicas); replicas_v0 = bch2_sb_field_get(sb, replicas_v0); @@ -999,17 +877,26 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) if (replicas) { struct bch_replicas_entry_v1 *r; - for_each_replicas_entry(replicas, r) - for (i = 0; i < r->nr_devs; i++) + for_each_replicas_entry(replicas, r) { + if (r->data_type >= sizeof(data_has) * 8) + 
continue; + + for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; + } + } else if (replicas_v0) { struct bch_replicas_entry_v0 *r; - for_each_replicas_entry_v0(replicas_v0, r) - for (i = 0; i < r->nr_devs; i++) + for_each_replicas_entry_v0(replicas_v0, r) { + if (r->data_type >= sizeof(data_has) * 8) + continue; + + for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; + } } @@ -1018,10 +905,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - unsigned ret; - mutex_lock(&c->sb_lock); - ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); mutex_unlock(&c->sb_lock); return ret; @@ -1029,25 +914,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_replicas_exit(struct bch_fs *c) { - unsigned i; - - kfree(c->usage_scratch); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - free_percpu(c->usage[i]); - kfree(c->usage_base); kfree(c->replicas.entries); kfree(c->replicas_gc.entries); - - mempool_exit(&c->replicas_delta_pool); -} - -int bch2_fs_replicas_init(struct bch_fs *c) -{ - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &c->replicas)); - - return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, - REPLICAS_DELTA_LIST_MAX) ?: - replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 654a4b26d3a3..5aba2c1ce133 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_sb *, struct printbuf *); + struct bch_fs *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline struct bch_replicas_entry_v1 * @@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *, void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); + +bool bch2_replicas_marked_locked(struct bch_fs *, + struct bch_replicas_entry_v1 *); bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry_v1 *); -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); - void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, @@ -58,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); -int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry_v1 *, - u64); - #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ @@ -88,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; void bch2_fs_replicas_exit(struct bch_fs *); -int bch2_fs_replicas_init(struct bch_fs *); #endif /* 
_BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h new file mode 100644 index 000000000000..b7eff904acdb --- /dev/null +++ b/fs/bcachefs/replicas_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_FORMAT_H +#define _BCACHEFS_REPLICAS_FORMAT_H + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[] __counted_by(nr_devs); +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry_v1 { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[] __counted_by(nr_devs); +} __packed; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry_v1 entries[]; +} __packed __aligned(8); + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#define replicas_entry_add_dev(e, d) ({ \ + (e)->nr_devs++; \ + (e)->devs[(e)->nr_devs - 1] = (d); \ +}) + +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index ac90d142c4e8..fed71c861fe7 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -8,20 +8,4 @@ struct bch_replicas_cpu { struct bch_replicas_entry_v1 *entries; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry_v1 r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[]; -}; - #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index b6bf0ebe7e84..59c8770e4a0e 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -23,16 +23,28 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { + struct bkey_validate_context from = { + .flags = write, + .from = BKEY_VALIDATE_superblock, + }; struct jset_entry *entry; int ret; for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); entry = vstruct_next(entry)) { + if (vstruct_end(entry) > vstruct_end(&clean->field)) { + bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu", + le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s), + (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field)); + bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun); + return -BCH_ERR_fsck_repair_unimplemented; + } + ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); + from); if (ret) return ret; } @@ -147,7 +159,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; mutex_unlock(&c->sb_lock); - return NULL; + return ERR_PTR(-BCH_ERR_invalid_sb_clean); } clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), @@ -159,6 +171,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { + kfree(clean); mutex_unlock(&c->sb_lock); return ERR_PTR(ret); } @@ -171,45 +184,10 @@ fsck_err: return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, 
u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - percpu_down_read(&c->mark_lock); - - if (!journal_seq) { - for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->b.nr_inodes); - } - { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), @@ -220,49 +198,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - for_each_member_device(c, ca) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(ca->dev_idx); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - - percpu_up_read(&c->mark_lock); - for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), @@ -274,9 +209,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, } } -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); @@ -286,6 +220,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, return -BCH_ERR_invalid_sb_clean; } + for (struct jset_entry *entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { + prt_str(err, "entry type "); + bch2_prt_jset_entry_type(err, entry->type); + prt_str(err, " overruns end of section"); + return -BCH_ERR_invalid_sb_clean; + } + } + return 0; } @@ -295,14 +240,15 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_clean *clean = 
field_to_type(f, clean); struct jset_entry *entry; - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); + prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); + prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { + if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) + break; + if (entry->type == BCH_JSET_ENTRY_btree_keys && !entry->u64s) continue; @@ -386,6 +332,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) goto out; } + bch2_journal_pos_from_member_info_set(c); + bch2_write_super(c); out: mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 7dc898761bb3..6992e7469112 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; }; -static int bch2_sb_counters_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { return 0; }; @@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (i = 0; i < nr; i++) { - if (i < BCH_COUNTER_NR) - prt_printf(out, "%s ", bch2_counter_names[i]); - else - prt_printf(out, "(unknown)"); - - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); - prt_newline(out); - } + for (unsigned i = 0; i < nr; i++) + prt_printf(out, "%s \t%llu\n", + i < BCH_COUNTER_NR ? 
bch2_counter_names[i] : "(unknown)", + le64_to_cpu(ctrs->d[i])); }; int bch2_sb_counters_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 62ea478215d0..fdcf598f08b1 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -2,86 +2,91 @@ #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H #define _BCACHEFS_SB_COUNTERS_FORMAT_H -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) +enum counters_flags { + TYPE_COUNTER = BIT(0), /* event counters */ + TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ +}; + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, 
TYPE_COUNTER) \ + x(btree_cache_reap, 8, TYPE_COUNTER) \ + x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ + x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ + x(btree_node_write, 13, TYPE_COUNTER) \ + x(btree_node_read, 14, TYPE_COUNTER) \ + x(btree_node_compact, 15, TYPE_COUNTER) \ + x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_split, 17, TYPE_COUNTER) \ + x(btree_node_rewrite, 18, TYPE_COUNTER) \ + x(btree_node_alloc, 19, TYPE_COUNTER) \ + x(btree_node_free, 20, TYPE_COUNTER) \ + x(btree_node_set_root, 21, TYPE_COUNTER) \ + x(btree_path_relock_fail, 22, TYPE_COUNTER) \ + x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ + x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ + x(journal_entry_full, 25, TYPE_COUNTER) \ + x(journal_full, 26, TYPE_COUNTER) \ + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ + x(read_promote, 30, TYPE_COUNTER) \ + x(read_bounce, 31, TYPE_COUNTER) \ + x(read_split, 33, TYPE_COUNTER) \ + x(read_retry, 32, TYPE_COUNTER) \ + x(read_reuse_race, 34, TYPE_COUNTER) \ + x(move_extent_read, 35, TYPE_SECTORS) \ + x(move_extent_write, 36, TYPE_SECTORS) \ + x(move_extent_finish, 37, TYPE_SECTORS) \ + x(move_extent_fail, 38, TYPE_COUNTER) \ + x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ + x(gc_gens_start, 43, TYPE_COUNTER) \ + x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ + x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ + x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ + x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ + x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ + x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ + x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ + x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ + x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ + x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ + x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ + x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ + x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ + x(trans_restart_relock, 57, TYPE_COUNTER) \ + x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ + x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ + x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ + x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ + x(trans_restart_relock_path, 62, TYPE_COUNTER) \ + x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ + x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ + x(trans_restart_traverse, 65, TYPE_COUNTER) \ + x(trans_restart_upgrade, 66, TYPE_COUNTER) \ + x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ + x(trans_restart_injected, 69, TYPE_COUNTER) \ + x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ + x(trans_traverse_all, 71, TYPE_COUNTER) \ + x(transaction_commit, 72, TYPE_COUNTER) \ + x(write_super, 73, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ + x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ + x(trans_restart_split_race, 76, TYPE_COUNTER) \ + x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ + x(write_buffer_flush_sync, 78, TYPE_COUNTER) enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 441dcb1bf160..051214fdc735 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -7,7 +7,7 @@ #include "bcachefs.h" #include "darray.h" -#include "recovery.h" +#include "recovery_passes.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" @@ -45,9 +45,100 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) - -#define DOWNGRADE_TABLE() + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) \ + x(mi_btree_bitmap, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_btree_bitmap_not_marked) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(disk_accounting_v3, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ + BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) + +#define DOWNGRADE_TABLE() \ + x(bucket_stripe_sectors, \ + 0) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_bkey_version_in_future) \ + x(disk_accounting_v3, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + 
BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_accounting_replicas_not_marked, \ + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) struct upgrade_downgrade_entry { u64 recovery_passes; @@ -71,6 +162,40 @@ UPGRADE_TABLE() #undef x }; +static int have_stripes(struct bch_fs *c) +{ + if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) + return 0; + + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); +} + +int bch2_sb_set_upgrade_extra(struct bch_fs *c) +{ + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = c->sb.version; + bool write_sb = false; + int ret = 0; + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && + new_version >= bcachefs_metadata_version_bucket_stripe_sectors && + (ret = have_stripes(c) > 0)) { + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); + write_sb = true; + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret < 0 ? ret : 0; +} + void bch2_sb_set_upgrade(struct bch_fs *c, unsigned old_version, unsigned new_version) @@ -92,16 +217,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c, ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - for (const u16 *e = i->errors; - e < i->errors + i->nr_errors; - e++) { - __set_bit(*e, c->sb.errors_silent); - ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); - } + for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) + __set_bit_le64(*e, ext->errors_silent); } } -#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) 
static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x @@ -116,6 +237,37 @@ DOWNGRADE_TABLE() #undef x }; +static int downgrade_table_extra(struct bch_fs *c, darray_char *table) +{ + struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); + int ret = 0; + + unsigned nr_errors = le16_to_cpu(dst->nr_errors); + + switch (le16_to_cpu(dst->version)) { + case bcachefs_metadata_version_bucket_stripe_sectors: + if (have_stripes(c)) { + bytes += sizeof(dst->errors[0]) * 2; + + ret = darray_make_room(table, bytes); + if (ret) + return ret; + + /* open coded __set_bit_le64, as dst is packed and + * dst->recovery_passes is misaligned */ + unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; + dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); + + dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); + } + break; + } + + dst->nr_errors = cpu_to_le16(nr_errors); + return ret; +} + static inline const struct bch_sb_field_downgrade_entry * downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) { @@ -125,15 +277,32 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) #define for_each_downgrade_entry(_d, _i) \ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \ + (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ + (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ _i = downgrade_entry_next_c(_i)) static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - for_each_downgrade_entry(e, i) { + for (const struct bch_sb_field_downgrade_entry *i = e->entries; + (void *) i < vstruct_end(&e->field); + i = downgrade_entry_next_c(i)) { + /* + * Careful: sb_field_downgrade_entry is only 2 byte aligned, but + * section sizes are 8 byte aligned - an empty entry spanning + * the end of the section is allowed (and ignored): + */ + if ((void *) &i->errors[0] > vstruct_end(&e->field)) + break; + + if (flags & BCH_VALIDATE_write && + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { + prt_printf(err, "downgrade entry overruns end of superblock section"); + return -BCH_ERR_invalid_sb_downgrade; + } + if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", @@ -155,26 +324,22 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 16); for_each_downgrade_entry(e, i) { - prt_str(out, "version:"); - prt_tab(out); + prt_str(out, "version:\t"); bch2_version_to_text(out, le16_to_cpu(i->version)); prt_newline(out); - prt_str(out, "recovery passes:"); - prt_tab(out); + prt_str(out, "recovery passes:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); prt_newline(out); - prt_str(out, "errors:"); - prt_tab(out); + prt_str(out, "errors:\t"); bool first = true; for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { if (!first) prt_char(out, ','); first = false; - unsigned e = le16_to_cpu(i->errors[j]); - prt_str(out, e < BCH_SB_ERR_MAX ? 
bch2_sb_error_strs[e] : "(unknown)"); + bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); } prt_newline(out); } @@ -187,6 +352,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { int bch2_sb_downgrade_update(struct bch_fs *c) { + if (!test_bit(BCH_FS_btree_running, &c->flags)) + return 0; + darray_char table = {}; int ret = 0; @@ -205,13 +373,22 @@ int bch2_sb_downgrade_update(struct bch_fs *c) dst = (void *) &darray_top(table); dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); dst->recovery_passes[1] = 0; dst->nr_errors = cpu_to_le16(src->nr_errors); for (unsigned i = 0; i < src->nr_errors; i++) dst->errors[i] = cpu_to_le16(src->errors[i]); - table.nr += bytes; + ret = downgrade_table_extra(c, &table); + if (ret) + goto out; + + if (!dst->recovery_passes[0] && + !dst->recovery_passes[1] && + !dst->nr_errors) + continue; + + table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); } struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); @@ -250,10 +427,10 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_SB_ERR_MAX) + if (e < BCH_FSCK_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + __set_bit_le64(e, ext->errors_silent); } } } diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index 57e6c916fc73..095b7cc9bb47 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); +int bch2_sb_set_upgrade_extra(struct bch_fs *); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h new file mode 100644 index 000000000000..cffd932be3ec --- /dev/null +++ b/fs/bcachefs/sb-downgrade_format.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H +#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + +#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 5f5bcae391fb..013a96883b4e 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -7,12 +7,12 @@ const char * const bch2_sb_error_strs[] = { #define x(t, n, ...) 
[n] = #t, BCH_SB_ERRS() - NULL +#undef x }; -static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) { - if (id < BCH_SB_ERR_MAX) + if (id < BCH_FSCK_ERR_MAX) prt_str(out, bch2_sb_error_strs[id]); else prt_printf(out, "(unknown error %u)", id); @@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) } static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned i, nr = bch2_sb_field_errors_nr_entries(e); @@ -110,19 +110,25 @@ out: void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index 8889001e7db4..b2357b8e6107 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -6,6 +6,8 @@ extern const char * const bch2_sb_error_strs[]; +void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); + extern const struct bch_sb_field_ops bch_sb_field_ops_errors; void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h new file mode 100644 index 000000000000..b86ec013d7d7 --- /dev/null +++ b/fs/bcachefs/sb-errors_format.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H +#define _BCACHEFS_SB_ERRORS_FORMAT_H + +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NO_RATELIMIT = 1 << 2, + FSCK_AUTOFIX = 1 << 3, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + x(journal_entry_clock_bad_size, 25, 0) \ + 
x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ + x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ + x(btree_root_read_error, 59, FSCK_AUTOFIX) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ + x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \ + x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ 
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ + x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ + x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ + x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ + x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \ + x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ + x(backpointer_dev_bad, 297, 0) \ + x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ + x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ + x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ + x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ + x(lru_entry_bad, 131, FSCK_AUTOFIX) \ + x(btree_ptr_val_too_big, 132, 0) \ + x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \ + x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \ + x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ + x(reflink_v_pos_bad, 292, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ + x(reflink_refcount_underflow, 293, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_csum_granularity_bad, 290, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + 
x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_unlinked_and_not_open, 281, 0) \ + x(inode_unlinked_but_has_dirent, 285, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ + x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ + x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ + x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ + x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ + x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ + x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ + x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ + x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ + x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ + x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ + x(extent_overlapping, 215, 0) \ + x(key_in_missing_inode, 216, 0) \ + x(key_in_wrong_inode_type, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_overwritten_inode, 302, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ + x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ + x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ + 
x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, FSCK_AUTOFIX) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ + x(accounting_mismatch, 272, FSCK_AUTOFIX) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ + x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ + x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ + x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ + x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ + x(MAX, 304, 0) + +enum bch_sb_error_id { +#define x(t, n, ...) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index c08aacdfd073..40325239c3b0 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,261 +4,6 @@ #include "darray.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - 
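Not part of the patch, but a compact illustration of what the new table above produces: the x-macro expands each entry into an enum constant (e.g. BCH_FSCK_ERR_directory_size_mismatch = 303), and each struct bch_sb_field_error_entry packs the 16-bit error id plus a 48-bit occurrence count into the single __le64 v, per the BCH_SB_ERROR_ENTRY_ID/NR bit ranges. The helpers below are a standalone sketch of that layout only (host-endian, no LE64_BITMASK), not the kernel accessors.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: same layout as BCH_SB_ERROR_ENTRY_ID (bits 0-15) and
 * BCH_SB_ERROR_ENTRY_NR (bits 16-63); little-endian conversion omitted. */
static uint64_t sb_error_entry_pack(uint16_t id, uint64_t nr)
{
	return (uint64_t) id | (nr << 16);
}

static unsigned sb_error_entry_id(uint64_t v) { return v & 0xffff; }
static uint64_t sb_error_entry_nr(uint64_t v) { return v >> 16; }

int main(void)
{
	/* error id 303 (directory_size_mismatch above), seen 7 times */
	uint64_t v = sb_error_entry_pack(303, 7);

	printf("id %u nr %llu\n", sb_error_entry_id(v),
	       (unsigned long long) sb_error_entry_nr(v));
	return 0;
}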
x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - 
x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - 
x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) - -enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; - struct bch_sb_error_entry_cpu { u64 id:16, nr:48; @@ -268,4 +13,3 @@ struct bch_sb_error_entry_cpu { typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ - diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c..116131f95815 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,12 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" +void bch2_dev_missing(struct bch_fs *c, unsigned dev) +{ + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); +} + +void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); +} + #define x(t, n, ...) 
[n] = #t, static const char * const bch2_iops_measurements[] = { BCH_IOPS_MEASUREMENTS() @@ -123,9 +136,9 @@ static int validate_member(struct printbuf *err, struct bch_sb *sb, int i) { - if (le64_to_cpu(m.nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m.nbuckets), LONG_MAX); + if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); return -BCH_ERR_invalid_sb_members; } @@ -150,6 +163,11 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -163,18 +181,14 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) return; - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); + prt_printf(out, "Device:\t%u\n", i); printbuf_indent_add(out, 2); - prt_printf(out, "Label:"); - prt_tab(out); + prt_printf(out, "Label:\t"); if (BCH_MEMBER_GROUP(&m)) { unsigned idx = BCH_MEMBER_GROUP(&m) - 1; @@ -188,103 +202,76 @@ static void member_to_text(struct printbuf *out, } prt_newline(out); - prt_printf(out, "UUID:"); - prt_tab(out); + prt_printf(out, "UUID:\t"); pr_uuid(out, m.uuid.b); prt_newline(out); - prt_printf(out, "Size:"); - prt_tab(out); + prt_printf(out, "Size:\t"); prt_units_u64(out, device_size << 9); prt_newline(out); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s errors:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - for (unsigned i = 0; i < BCH_IOPS_NR; i++) { - prt_printf(out, "%s iops:", bch2_iops_measurements[i]); - prt_tab(out); - prt_printf(out, "%u", le32_to_cpu(m.iops[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - prt_printf(out, "Bucket size:"); - prt_tab(out); + prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); - prt_newline(out); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - prt_printf(out, "Last mount:"); - prt_tab(out); + prt_printf(out, "Last mount:\t"); if (m.last_mount) bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:"); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.seq)); - prt_newline(out); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", + prt_printf(out, "State:\t%s\n", BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR ? 
bch2_member_states[BCH_MEMBER_STATE(&m)] : "unknown"); - prt_newline(out); - prt_printf(out, "Data allowed:"); - prt_tab(out); + prt_printf(out, "Data allowed:\t"); if (BCH_MEMBER_DATA_ALLOWED(&m)) prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); - prt_printf(out, "Has data:"); - prt_tab(out); + prt_printf(out, "Has data:\t"); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); - prt_str(out, "Durability:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Btree allocated bitmap blocksize:\t"); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Btree allocated bitmap:\t"); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_newline(out); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); printbuf_indent_sub(out, 2); } -static int bch2_sb_members_v1_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; @@ -332,9 +319,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, member_to_text(out, members_v2_get(mi, i), gi, sb, i); } -static int bch2_sb_members_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - @@ -389,12 +375,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); printbuf_indent_sub(out, 2); prt_str(out, "IO errors since "); @@ -403,12 +385,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); 
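Not part of the patch: a minimal standalone model of the per-member btree allocated bitmap that member_to_text() now prints above and that bch2_dev_btree_bitmap_marked_sectors() / __bch2_dev_btree_bitmap_mark() (added later in this diff) operate on. The assumption being illustrated: the 64-bit bitmap covers the device in chunks of 1 << btree_bitmap_shift sectors, so marking or testing a sector range means setting or testing every bit whose chunk intersects it; the granularity-doubling resize step in __bch2_dev_btree_bitmap_mark() is omitted here.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the on-disk fields: 64 bits, each covering (1 << shift) sectors. */
struct bitmap_model {
	uint64_t	bitmap;
	unsigned	shift;
};

/* Set every bit whose chunk intersects [start, start + sectors).
 * Caller must keep end <= 64ULL << shift (the kernel resizes shift first). */
static void model_mark(struct bitmap_model *m, uint64_t start, unsigned sectors)
{
	uint64_t end = start + sectors;

	for (unsigned bit = start >> m->shift;
	     (uint64_t) bit << m->shift < end;
	     bit++)
		m->bitmap |= 1ULL << bit;
}

/* True if the whole range is covered by set bits, as in
 * bch2_dev_btree_bitmap_marked_sectors(). */
static bool model_marked(const struct bitmap_model *m, uint64_t start, unsigned sectors)
{
	uint64_t end = start + sectors;

	if (end > 64ULL << m->shift)
		return false;

	for (unsigned bit = start >> m->shift;
	     (uint64_t) bit << m->shift < end;
	     bit++)
		if (!(m->bitmap & (1ULL << bit)))
			return false;
	return true;
}

int main(void)
{
	struct bitmap_model m = { .bitmap = 0, .shift = 10 };	/* 1024-sector chunks */

	model_mark(&m, 2048, 512);				/* sets bit 2 */
	printf("%d %d\n", model_marked(&m, 2048, 512),		/* 1 */
			  model_marked(&m, 4096, 512));		/* 0 */
	return 0;
}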
printbuf_indent_sub(out, 2); } @@ -426,3 +405,128 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + bool ret = true; + rcu_read_lock(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { + ret = false; + break; + } + } + rcu_read_unlock(); + return ret; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); + BUG_ON(end > 64ULL << m->btree_bitmap_shift); + + for (unsigned bit = start >> m->btree_bitmap_shift; + (u64) bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + continue; + + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + } +} + +unsigned bch2_sb_nr_devices(const struct bch_sb *sb) +{ + unsigned nr = 0; + + for (unsigned i = 0; i < sb->nr_devices; i++) + nr += bch2_member_exists((struct bch_sb *) sb, i); + return nr; +} + +int bch2_sb_member_alloc(struct bch_fs *c) +{ + unsigned dev_idx = c->sb.nr_devices; + struct bch_sb_field_members_v2 *mi; + unsigned nr_devices; + unsigned u64s; + int best = -1; + u64 best_last_mount = 0; + + if (dev_idx < BCH_SB_MEMBERS_MAX) + goto have_slot; + + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + /* eventually BCH_SB_MEMBERS_MAX will be raised */ + if (dev_idx == BCH_SB_MEMBER_INVALID) + continue; + + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } + + return -BCH_ERR_ENOSPC_sb_members; +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members; + + c->disk_sb.sb->nr_devices = nr_devices; + return dev_idx; +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a94183271..762083b564ee 100644 --- a/fs/bcachefs/sb-members.h +++ 
b/fs/bcachefs/sb-members.h @@ -3,6 +3,7 @@ #define _BCACHEFS_SB_MEMBERS_H #include "darray.h" +#include "bkey_types.h" extern char * const bch2_member_error_strs[]; @@ -28,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca) ca->mi.state != BCH_MEMBER_STATE_failed; } -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) { return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); @@ -104,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +static inline void bch2_dev_get(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); +#else + percpu_ref_get(&ca->ref); +#endif +} + +static inline void __bch2_dev_put(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + long r = atomic_long_dec_return(&ca->ref); + if (r < (long) !ca->dying) + panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); + ca->last_put = _THIS_IP_; + if (!r) + complete(&ca->ref_completion); +#else + percpu_ref_put(&ca->ref); +#endif +} + +static inline void bch2_dev_put(struct bch_dev *ca) { if (ca) - percpu_ref_put(&ca->ref); + __bch2_dev_put(ca); +} +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +{ rcu_read_lock(); + bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) - percpu_ref_get(&ca->ref); + bch2_dev_get(ca); rcu_read_unlock(); return ca; @@ -131,10 +146,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, unsigned state_mask) { + rcu_read_lock(); if (ca) percpu_ref_put(&ca->io_ref); - rcu_read_lock(); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) @@ -157,26 +172,121 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, #define for_each_readable_member(c, ca) \ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + +static inline bool bucket_valid(const struct bch_dev *ca, u64 b) +{ + return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; +} + +static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + EBUG_ON(!bch2_dev_exists(c, dev)); - return rcu_dereference_check(c->devs[idx], 1); + return rcu_dereference_check(c->devs[dev], 1); } -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + EBUG_ON(!bch2_dev_exists(c, dev)); - return rcu_dereference_protected(c->devs[idx], + return rcu_dereference_protected(c->devs[dev], 
lockdep_is_held(&c->sb_lock) || lockdep_is_held(&c->state_lock)); } +static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) +{ + return c && dev < c->sb.nr_devices + ? rcu_dereference(c->devs[dev]) + : NULL; +} + +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (unlikely(!ca)) + bch2_dev_missing(c, dev); + return ca; +} + +static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (ca) + bch2_dev_get(ca); + rcu_read_unlock(); + return ca; +} + +static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + if (unlikely(!ca)) + bch2_dev_missing(c, dev); + return ca; +} + +static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); + if (ca && !bucket_valid(ca, bucket.offset)) { + bch2_dev_put(ca); + ca = NULL; + } + return ca; +} + +void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); + +static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); + if (!ca) + bch2_dev_bucket_missing(c, bucket); + return ca; +} + +static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget_noerror(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (ca && + (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + return ca; + + if (ca) + percpu_ref_put(&ca->io_ref); + return NULL; +} + /* XXX kill, move to struct bch_fs */ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { @@ -191,24 +301,28 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; -static inline bool bch2_member_exists(struct bch_member *m) +static inline bool bch2_member_alive(struct bch_member *m) { return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } -static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) { if (dev < sb->nr_devices) { struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_exists(&m); + return bch2_member_alive(&m); } return false; } +unsigned bch2_sb_nr_devices(const struct bch_sb *); + static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { .nbuckets = le64_to_cpu(mi->nbuckets), + .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - + le16_to_cpu(mi->first_bucket), .first_bucket = le16_to_cpu(mi->first_bucket), .bucket_size = 
le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), @@ -219,7 +333,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), + .valid = bch2_member_alive(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; } @@ -228,4 +344,24 @@ void bch2_sb_members_from_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (end > 64ULL << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = start >> ca->mi.btree_bitmap_shift; + (u64) bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + return false; + return true; +} + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + +int bch2_sb_member_alloc(struct bch_fs *); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h new file mode 100644 index 000000000000..2adf1221a440 --- /dev/null +++ b/fs/bcachefs/sb-members_format.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H +#define _BCACHEFS_SB_MEMBERS_FORMAT_H + +/* + * We refer to members with bitmasks in various places - but we need to get rid + * of this limit: + */ +#define BCH_SB_MEMBERS_MAX 64 + +/* + * Sentinal value - indicates a device that does not exist + */ +#define BCH_SB_MEMBER_INVALID 255 + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __u8 btree_bitmap_shift; + __u8 pad[3]; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; + __le64 seq; + __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; +}; + +/* + * btree_allocated_bitmap can represent sector addresses of a u64: it itself has + * 64 elements, so 64 - ilog2(64) + */ +#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 + +/* + * This limit comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, 
HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h new file mode 100644 index 000000000000..c0eda888fe39 --- /dev/null +++ b/fs/bcachefs/sb-members_types.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H +#define _BCACHEFS_SB_MEMBERS_TYPES_H + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u64 nbuckets_minus_first; + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 freespace_initialized; + u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h index c1860d8163fb..c4b3d8d3f414 100644 --- a/fs/bcachefs/seqmutex.h +++ b/fs/bcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c index dc1a27cc31cd..a1cc44e66c7e 100644 --- a/fs/bcachefs/siphash.c +++ b/fs/bcachefs/siphash.c @@ -45,7 +45,7 @@ */ #include <asm/byteorder.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/bitops.h> #include <linux/string.h> diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 3a494c5d1247..7c403427fbdb 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_write; } } else if (type == SIX_LOCK_write && lock->readers) { - if (try) { + if (try) atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } + /* + * Make sure atomic_add happens before pcpu_read_count and + * six_set_bitmask in slow path happens before pcpu_read_count. + * + * Paired with the smp_mb() in read lock fast path (per-cpu mode) + * and the one before atomic_read in read unlock path. 
+ */ + smp_mb(); ret = !pcpu_read_count(lock); if (try && !ret) { @@ -335,7 +341,7 @@ static inline bool six_owner_running(struct six_lock *lock) */ rcu_read_lock(); struct task_struct *owner = READ_ONCE(lock->owner); - bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); + bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); rcu_read_unlock(); return ret; @@ -485,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); - if (unlikely(acquired)) + if (unlikely(acquired)) { do_six_unlock_type(lock, type); + } else if (type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } break; } @@ -495,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, __set_current_state(TASK_RUNNING); out: - if (ret && type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } trace_contention_end(lock, 0); return ret; @@ -610,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) six_release(&lock->dep_map, ip); - else - lock->seq++; if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -619,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long return; } + if (type == SIX_LOCK_write && + lock->write_lock_recurse) { + --lock->write_lock_recurse; + return; + } + + if (type == SIX_LOCK_write) + lock->seq++; + do_six_unlock_type(lock, type); } EXPORT_SYMBOL_GPL(six_unlock_ip); @@ -729,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) atomic_add(l[type].lock_val, &lock->state); } break; + case SIX_LOCK_write: + lock->write_lock_recurse++; + fallthrough; case SIX_LOCK_intent: EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; - case SIX_LOCK_write: - BUG(); - break; } } EXPORT_SYMBOL_GPL(six_lock_increment); @@ -837,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp) { atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); @@ -860,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, * failure if they wish by checking lock->readers, but generally * will not want to treat it as an error. */ - lock->readers = alloc_percpu(unsigned); + lock->readers = alloc_percpu_gfp(unsigned, gfp); } #endif } diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 68d46fd7f391..59b851cf8bac 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -137,6 +137,7 @@ struct six_lock { atomic_t state; u32 seq; unsigned intent_lock_recurse; + unsigned write_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; raw_spinlock_t wait_lock; @@ -163,18 +164,19 @@ enum six_lock_init_flags { }; void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. 
SIX_LOCK_INIT_PCPU */ -#define six_lock_init(lock, flags) \ +#define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key, flags); \ + __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ac6ba04d5521..c54091a28909 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -2,12 +2,14 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include <linux/random.h> @@ -30,15 +32,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_tree_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_tree_pos_bad, "bad pos"); fsck_err: return ret; @@ -48,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot_tree *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) ret = -BCH_ERR_ENOENT_snapshot_tree; @@ -91,23 +92,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, /* Snapshot nodes: */ -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) { - struct snapshot_table *t; + while (id && id < ancestor) { + const struct snapshot_t *s = __snapshot_t(t, id); + id = s ? 
s->parent : 0; + } + return id == ancestor; +} +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; + bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); rcu_read_unlock(); - return id == ancestor; + return ret; } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; if (s->skip[2] <= ancestor) return s->skip[2]; @@ -118,27 +125,36 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances return s->parent; } +static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return false; + + return test_bit(ancestor - id - 1, s->is_ancestor); +} + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - struct snapshot_table *t; bool ret; - EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); - rcu_read_lock(); - t = rcu_dereference(c->snapshots); + struct snapshot_table *t = rcu_dereference(c->snapshots); + + if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { + ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); + goto out; + } while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - if (id && id < ancestor) { - ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); - - EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); - } else { - ret = id == ancestor; - } + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); +out: rcu_read_unlock(); return ret; @@ -147,36 +163,42 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; - size_t new_size; struct snapshot_table *new, *old; - new_size = max(16UL, roundup_pow_of_two(idx + 1)); + size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); + size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + + if (unlikely(new_bytes > INT_MAX)) + return NULL; - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; + new->nr = new_size; + old = rcu_dereference_protected(c->snapshots, true); if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); + memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - kvfree_rcu_mightsleep(old); + kvfree_rcu(old, rcu); - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + return &rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock))->s[idx]; } static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; + struct snapshot_table *table = + rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock)); lockdep_assert_held(&c->snapshot_table_lock); - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + if (likely(table && idx < table->nr)) + return &table->s[idx]; return 
__snapshot_t_mut(c, id); } @@ -203,55 +225,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_snapshot s; u32 i, id; int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_pos_bad, "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, - snapshot_parent_bad, + bkey_fsck_err_on(id && id <= k.k->p.offset, + c, snapshot_parent_bad, "bad parent node (%u <= %llu)", id, k.k->p.offset); - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, - snapshot_children_not_normalized, + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), + c, snapshot_children_not_normalized, "children not normalized"); - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, - snapshot_child_duplicate, + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], + c, snapshot_child_duplicate, "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - bkey_fsck_err_on(id >= k.k->p.offset, c, err, - snapshot_child_bad, + bkey_fsck_err_on(id >= k.k->p.offset, + c, snapshot_child_bad, "bad child node (%u >= %llu)", id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, - snapshot_skiplist_not_normalized, + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), + c, snapshot_skiplist_not_normalized, "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, - snapshot_skiplist_bad, + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), + c, snapshot_skiplist_bad, "bad skiplist node %u", id); } } @@ -259,27 +280,10 @@ fsck_err: return ret; } -static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - struct snapshot_t *t = snapshot_t_mut(c, id); - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); -} - -static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - __set_is_ancestor_bitmap(c, id); - mutex_unlock(&c->snapshot_table_lock); -} - static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct snapshot_t *t; @@ -297,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + t->live = true; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -315,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - __set_is_ancestor_bitmap(c, id); + u32 parent = id; + + while ((parent = 
bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); @@ -333,7 +342,7 @@ err: int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); } @@ -342,71 +351,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); -} - -static int bch2_snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -/* - * If @k is a snapshot with just one live child, it's part of a linear chain, - * which we consider to be an equivalence class: and then after snapshot - * deletion cleanup, there should only be a single key at a given position in - * this equivalence class. - * - * This sets the equivalence class of @k to be the child's equivalence class, if - * it's part of such a linear chain: this correctly sets equivalence classes on - * startup if we run leaf to root (i.e. in natural key order). - */ -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = bch2_snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? 
snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; + BTREE_ITER_with_updates, snapshot, s); } /* fsck: */ @@ -449,6 +394,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) u32 id = snapshot_root; u32 subvol = 0, s; + rcu_read_lock(); while (id) { s = snapshot_t(c, id)->subvol; @@ -457,6 +403,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) id = bch2_snapshot_tree_next(c, id); } + rcu_read_unlock(); return subvol; } @@ -484,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { @@ -514,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans, struct bch_snapshot s; struct bch_subvolume subvol; struct printbuf buf = PRINTBUF; + struct btree_iter snapshot_iter = {}; u32 root_id; int ret; @@ -523,39 +470,52 @@ static int check_snapshot_tree(struct btree_trans *trans, st = bkey_s_c_to_snapshot_tree(k); root_id = le32_to_cpu(st.v->root_snapshot); - ret = bch2_snapshot_lookup(trans, root_id, &s); + struct bkey_s_c_snapshot snapshot_k = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, + POS(0, root_id), 0, snapshot); + ret = bkey_err(snapshot_k); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (!ret) + bkey_val_copy(&s, snapshot_k); + if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), - c, snapshot_tree_to_missing_snapshot, + trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, st.s_c), + prt_newline(&buf), + ret + ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) + : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), + buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto err; } - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); + if (!st.v->master_subvol) + goto out; + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (fsck_err_on(ret, - c, snapshot_tree_to_missing_subvol, + trans, snapshot_tree_to_missing_subvol, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), - c, snapshot_tree_to_wrong_subvol, + trans, snapshot_tree_to_wrong_subvol, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - c, snapshot_tree_to_snapshot_subvol, + trans, snapshot_tree_to_snapshot_subvol, "snapshot tree points to snapshot subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { @@ -563,6 +523,13 @@ static int check_snapshot_tree(struct btree_trans *trans, u32 subvol_id; ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ + ret = 0; + goto err; + } + if (ret) goto err; @@ -574,8 +541,10 @@ static int check_snapshot_tree(struct btree_trans *trans, u->v.master_subvol = cpu_to_le32(subvol_id); st = snapshot_tree_i_to_s_c(u); } +out: err: fsck_err: + bch2_trans_iter_exit(trans, &snapshot_iter); printbuf_exit(&buf); return ret; } @@ -592,7 +561,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); bch_err_fn(c, ret); @@ -669,7 +638,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, root = bch2_bkey_get_iter_typed(trans, &root_iter, BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); + BTREE_ITER_with_updates, snapshot); ret = bkey_err(root); if (ret) goto err; @@ -720,7 +689,6 @@ static int check_snapshot(struct btree_trans *trans, u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; struct printbuf buf = PRINTBUF; - bool should_have_subvol; u32 i, id; int ret = 0; @@ -766,12 +734,12 @@ static int check_snapshot(struct btree_trans *trans, } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + ret = bch2_subvolume_get(trans, id, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -786,7 +754,7 @@ static int check_snapshot(struct btree_trans *trans, } } else { if (fsck_err_on(s.subvol, - c, snapshot_should_not_have_subvol, + trans, snapshot_should_not_have_subvol, "snapshot should not point to subvol:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, 
snapshot); @@ -803,7 +771,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + if (fsck_err_on(!ret, + trans, snapshot_to_bad_snapshot_tree, "snapshot points to missing/incorrect tree:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); @@ -815,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - c, snapshot_bad_depth, + trans, snapshot_bad_depth, "snapshot with incorrect depth field, should be %u:\n %s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -831,7 +800,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + if (fsck_err_on(!ret, + trans, snapshot_bad_skiplist, "snapshot with bad skiplist field:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -861,26 +831,221 @@ int bch2_check_snapshots(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot(trans, &iter, k))); bch_err_fn(c, ret); return ret; } +static int check_snapshot_exists(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + + if (bch2_snapshot_exists(c, id)) + return 0; + + /* Do we need to reconstruct the snapshot_tree entry as well? */ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u32 tree_id = 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + if (!tree_id) { + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + } + + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); + ret = PTR_ERR_OR_ZERO(snapshot); + if (ret) + return ret; + + bkey_snapshot_init(&snapshot->k_i); + snapshot->k.p = POS(0, id); + snapshot->v.tree = cpu_to_le32(tree_id); + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); +} + +/* Figure out which snapshot nodes belong in the same tree: */ +struct snapshot_tree_reconstruct { + enum btree_id btree; + struct bpos cur_pos; + snapshot_id_list cur_ids; + DARRAY(snapshot_id_list) trees; +}; + +static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) +{ + darray_for_each(r->trees, i) + darray_exit(i); + darray_exit(&r->trees); + darray_exit(&r->cur_ids); +} + +static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + return r->btree == BTREE_ID_inodes + ? 
r->cur_pos.offset == pos.offset + : r->cur_pos.inode == pos.inode; +} + +static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) +{ + darray_for_each(*l, i) + if (snapshot_list_has_id(r, *i)) + return true; + return false; +} + +static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +{ + bool first = true; + darray_for_each(*s, i) { + if (!first) + prt_char(out, ' '); + first = false; + prt_printf(out, "%u", *i); + } +} + +static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) +{ + if (r->cur_ids.nr) { + darray_for_each(r->trees, i) + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { + int ret = snapshot_list_merge(c, i, &r->cur_ids); + if (ret) + return ret; + goto out; + } + darray_push(&r->trees, r->cur_ids); + darray_init(&r->cur_ids); + } +out: + r->cur_ids.nr = 0; + return 0; +} + +static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + if (!same_snapshot(r, pos)) + snapshot_tree_reconstruct_next(c, r); + r->cur_pos = pos; + return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); +} + +int bch2_reconstruct_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) + goto err; + + snapshot_tree_reconstruct_next(c, &r); + } + } + + darray_for_each(r.trees, t) { + printbuf_reset(&buf); + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { + if (fsck_err_on(!bch2_snapshot_exists(c, *id), + trans, snapshot_node_missing, + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { + if (t->nr > 1) { + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot_exists(trans, *id)); + if (ret) + goto err; + } + } + } +fsck_err: +err: + bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + trans, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); + int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); @@ -917,7 
+1082,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) int ret = 0; s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); + BTREE_ITER_intent, snapshot); ret = bkey_err(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); @@ -1026,7 +1191,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); + POS_MIN, BTREE_ITER_intent); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -1068,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; - - mutex_lock(&c->snapshot_table_lock); - snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; - mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1177,126 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int snapshot_delete_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + darray_for_each(*l, i) + if (i->id == id) + return i->live_child; + return 0; +} - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - *last_pos = k.k->p; +static unsigned __live_child(struct snapshot_table *t, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } else { - return snapshot_list_add(c, equiv_seen, equiv); + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) + if (s->children[i] && + !snapshot_list_has_id(delete_leaves, s->children[i]) && + !interior_delete_has_id(delete_interior, s->children[i])) + return s->children[i]; + + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { + u32 live_child = s->children[i] + ? __live_child(t, s->children[i], delete_leaves, delete_interior) + : 0; + if (live_child) + return live_child; } + + return 0; } -static int move_key_to_correct_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +static unsigned live_child(struct bch_fs *c, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + rcu_read_lock(); + u32 ret = __live_child(rcu_dereference(c->snapshots), id, + delete_leaves, delete_interior); + rcu_read_unlock(); + return ret; +} - /* - * When we have a linear chain of snapshot nodes, we consider - * those to form an equivalence class: we're going to collapse - * them all down to a single node, and keep the leaf-most node - - * which has the same id as the equivalence class id. 
- * - * If there are multiple keys in different snapshots at the same - * position, we're only going to keep the one in the newest - * snapshot - the rest have been overwritten and are redundant, - * and for the key we're going to keep we need to move it to the - * equivalance class ID if it's not there already. - */ - if (equiv != k.k->p.snapshot) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - struct btree_iter new_iter; - int ret; +static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(new); + u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + if (live_child) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - new->k.p.snapshot = equiv; + new->k.p.snapshot = live_child; - bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(&new_iter) ?: - bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &new_iter); + struct btree_iter dst_iter; + struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, + iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + ret = bkey_err(dst_k); if (ret) return ret; + + ret = (bkey_deleted(dst_k.k) + ? bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; } return 0; } -static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - return !ret; -} + live_children += child && + !snapshot_list_has_id(delete_leaves, child); + } -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. 
- */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - int ret = bch2_snapshot_needs_delete(trans, k); + if (live_children == 0) { + return snapshot_list_add(c, delete_leaves, s.k->p.offset); + } else if (live_children == 1) { + struct snapshot_interior_delete d = { + .id = s.k->p.offset, + .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + }; + + if (!d.live_child) { + bch_err(c, "error finding live child of snapshot %u", d.id); + return -EINVAL; + } - return ret <= 0 - ? ret - : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return darray_push(delete_interior, d); + } else { + return 0; + } } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - snapshot_id_list *skip) + interior_delete_list *skip) { rcu_read_lock(); - while (snapshot_list_has_id(skip, id)) + while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); while (n--) { do { id = __bch2_snapshot_parent(c, id); - } while (snapshot_list_has_id(skip, id)); + } while (interior_delete_has_id(skip, id)); } rcu_read_unlock(); @@ -1305,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted) + interior_delete_list *deleted) { struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; @@ -1315,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - if (snapshot_list_has_id(deleted, k.k->p.offset)) + if (interior_delete_has_id(deleted, k.k->p.offset)) return 0; s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); @@ -1324,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; @@ -1342,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { u32 id = le32_to_cpu(s->v.skip[j]); - if (snapshot_list_has_id(deleted, id)) { + if (interior_delete_has_id(deleted, id)) { id = bch2_snapshot_nth_parent_skip(c, parent, depth > 1 @@ -1361,108 +1549,74 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans *trans; - snapshot_id_list deleted = { 0 }; - snapshot_id_list deleted_interior = { 0 }; - u32 id; - int ret = 0; - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - - trans = bch2_trans_get(c); + struct btree_trans *trans = bch2_trans_get(c); + snapshot_id_list delete_leaves = {}; + interior_delete_list delete_interior = {}; + int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, k)); - bch_err_msg(c, ret, "deleting redundant snapshots"); + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, + 
check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(trans, k)); - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); - if (ret) + if (!delete_leaves.nr && !delete_interior.nr) goto err; - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - if (k.k->type != KEY_TYPE_snapshot) - continue; + { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "deleting leaves"); + darray_for_each(delete_leaves, i) + prt_printf(&buf, " %u", *i); - BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) - ? snapshot_list_add(c, &deleted, k.k->p.offset) - : 0; - })); - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; + prt_printf(&buf, " interior"); + darray_for_each(delete_interior, i) + prt_printf(&buf, " %u->%u", i->id, i->live_child); - for (id = 0; id < BTREE_ID_NR; id++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; - struct disk_reservation res = { 0 }; + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); + printbuf_exit(&buf); + if (ret) + goto err; + } - if (!btree_type_has_snapshots(id)) - continue; + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + struct disk_reservation res = { 0 }; - /* - * deleted inodes btree is maintained by a trigger on the inodes - * btree - no work for us to do here, and it's not safe to scan - * it because we'll see out of date keys due to the btree write - * buffer: - */ - if (id == BTREE_ID_deleted_inodes) + if (!btree_type_has_snapshots(btree)) continue; ret = for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - move_key_to_correct_snapshot(trans, &iter, k)); + delete_dead_snapshots_process_key(trans, &iter, k, + &delete_leaves, + &delete_interior)); bch2_disk_reservation_put(c, &res); - darray_exit(&equiv_seen); - bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) goto err; } - bch2_trans_unlock(trans); - down_write(&c->snapshot_create_lock); - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - u32 snapshot = k.k->p.offset; - u32 equiv = bch2_snapshot_equiv(c, snapshot); - - equiv != snapshot - ? 
snapshot_list_add(c, &deleted_interior, snapshot) - : 0; - })); - - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err_create_lock; + darray_for_each(delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) + goto err; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1470,34 +1624,26 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) * nodes some depth fields will be off: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); if (ret) - goto err_create_lock; - - darray_for_each(deleted, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err_create_lock; - } + goto err; - darray_for_each(deleted_interior, i) { + darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch2_snapshot_node_delete(trans, i->id)); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) - goto err_create_lock; + goto err; } -err_create_lock: - up_write(&c->snapshot_create_lock); err: - darray_exit(&deleted_interior); - darray_exit(&deleted); + darray_exit(&delete_interior); + darray_exit(&delete_leaves); bch2_trans_put(trans); - bch_err_fn(c, ret); + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); return ret; } @@ -1505,14 +1651,20 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + return; + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -1525,18 +1677,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - + for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots, + k, ret) { if (!bkey_eq(pos, k.k->p)) break; @@ -1550,134 +1694,51 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - - return s->children[1] ?: s->children[0]; -} - -static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -{ - u32 child; - - while ((child = bch2_snapshot_smallest_child(c, id))) - id = child; - return id; -} - 
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c interior_k, - u32 leaf_id, struct bpos *new_min_pos) -{ - struct btree_iter iter; - struct bpos pos = interior_k.k->p; - struct bkey_s_c k; - struct bkey_i *new; - int ret; - - pos.snapshot = leaf_id; - - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - /* key already overwritten in this snapshot? */ - if (k.k->p.snapshot != interior_k.k->p.snapshot) - goto out; - - if (bpos_eq(*new_min_pos, POS_MIN)) { - *new_min_pos = k.k->p; - new_min_pos->snapshot = leaf_id; - } - - new = bch2_bkey_make_mut_noupdate(trans, interior_k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - new->k.p.snapshot = leaf_id; - ret = bch2_trans_update(trans, &iter, new, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c k, - struct bpos *new_min_pos) +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) { - struct bch_fs *c = trans->c; - struct bkey_buf sk; - u32 restart_count = trans->restart_count; - int ret = 0; - - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - *new_min_pos = POS_MIN; - - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); - id < k.k->p.snapshot; - id++) { - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || - !bch2_snapshot_is_leaf(c, id)) - continue; -again: - ret = btree_trans_too_many_iters(trans) ?: - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto again; - } - - if (ret) - break; - } - - bch2_bkey_buf_exit(&sk, c); - - return ret ?: trans_was_restarted(trans, restart_count); + /* If there's one child, it's redundant and keys will be moved to the child */ + return !!snap.v->children[0] + !!snap.v->children[1] == 1; } static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot snap; - int ret = 0; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v) || - bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || - (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return 0; - } + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - return ret; + return 0; } int bch2_snapshots_read(struct bch_fs *c) { + /* + * Initializing the is_ancestor bitmaps requires ancestors to already be + * initialized - so mark in reverse: + */ int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, + POS_MAX, 0, k, __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k) ?: - bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + bch2_check_snapshot_needs_deletion(trans, k))); bch_err_fn(c, ret); + + /* + * It's 
important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (bch2_err_matches(ret, EIO) || + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); + return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 7c66ffc06385..00373cf32e7b 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,14 +2,12 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bkey_invalid_flags; - void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ + .key_validate = bch2_snapshot_tree_validate, \ .val_to_text = bch2_snapshot_tree_to_text, \ .min_val_size = 8, \ }) @@ -19,13 +17,14 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ + .key_validate = bch2_snapshot_validate, \ .val_to_text = bch2_snapshot_to_text, \ .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ @@ -33,7 +32,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return &t->s[U32_MAX - id]; + u32 idx = U32_MAX - id; + + return likely(t && idx < t->nr) + ? &t->s[idx] + : NULL; } static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) @@ -44,7 +47,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = snapshot_t(c, id)->tree; + const struct snapshot_t *s = snapshot_t(c, id); + id = s ? s->tree : 0; rcu_read_unlock(); return id; @@ -52,7 +56,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? 
s->parent : 0; } static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -66,19 +71,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + if (!s) + return 0; - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + parent && + s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", id, snapshot_t(c, id)->depth, parent, snapshot_t(c, parent)->depth); return parent; -#else - return snapshot_t(c, id)->parent; -#endif } static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -114,57 +119,37 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->equiv; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->live : 0; } -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); + bool ret = __bch2_snapshot_exists(c, id); rcu_read_unlock(); - return id; -} - -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); + return ret; } -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - bool ret; - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; + const struct snapshot_t *s = snapshot_t(c, id); + int ret = s ? 
s->children[0] : -BCH_ERR_invalid_snapshot_node; rcu_read_unlock(); return ret; } -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; + int ret = bch2_snapshot_is_internal_node(c, id); + if (ret < 0) + return ret; + return !ret; } static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) @@ -189,12 +174,9 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - const struct snapshot_t *t; - bool ret; - rcu_read_lock(); - t = snapshot_t(c, id); - ret = (t->children[0]|t->children[1]) != 0; + const struct snapshot_t *t = snapshot_t(c, id); + bool ret = t && (t->children[0]|t->children[1]) != 0; rcu_read_unlock(); return ret; @@ -218,15 +200,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - int ret; - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); + int ret = darray_push(s, id); if (ret) bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); return ret; } +static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret = snapshot_list_has_id(s, id) + ? 0 + : darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) +{ + darray_for_each(*src, i) { + int ret = snapshot_list_add_nodup(c, dst, *i); + if (ret) + return ret; + } + + return 0; +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, @@ -238,6 +239,8 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); +int bch2_reconstruct_snapshots(struct bch_fs *); +int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); @@ -249,15 +252,12 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) return 0; return __bch2_key_has_snapshot_overwrites(trans, id, pos); } -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, - struct bkey_s_c, struct bpos *); - int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c new file mode 100644 index 000000000000..d78451c2a0c6 --- /dev/null +++ b/fs/bcachefs/str_hash.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "dirent.h" +#include 
"fsck.h" +#include "str_hash.h" +#include "subvolume.h" + +static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +{ + if (d.v->d_type == DT_SUBVOL) { + struct bch_subvolume subvol; + int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), + false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} + +static noinline int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +} + +static noinline int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; + + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +static int repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != snapshot_root->bi_inum) + break; + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + break; + + if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + break; + } + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * All versions of 
the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ +static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found; + } + bch_err(c, "%s(): inum %llu not found", __func__, inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; +found:; + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + if (hash_info->type != hash2.type || + memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { + bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" + "%u %llx %llx\n" + "%u %llx %llx", + hash_info->type, + hash_info->siphash_key.k0, + hash_info->siphash_key.k1, + hash2.type, + hash2.siphash_key.k0, + hash2.siphash_key.k1); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + int ret = 0; + + u64 hash = desc->hash_bkey(hash_info, hash_k); + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc->btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc->key_type && + !desc->cmp_bkey(k, hash_k)) + goto duplicate_entries; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) + goto out; + + if (fsck_err(trans, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } +fsck_err: + goto out; 
+duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fcaa5a888744..55a4ac7bf220 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,16 +15,6 @@ #include <crypto/hash.h> #include <crypto/sha2.h> -typedef unsigned __bitwise bch_str_hash_flags_t; - -enum bch_str_hash_flags { - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, -}; - -#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) - static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -56,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { /* XXX ick */ struct bch_hash_info info = { - .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & - ~(~0U << INODE_STR_HASH_BITS), + .type = INODE_STR_HASH(bi), .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -159,24 +148,25 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s desc.is_visible(inum, k)); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags, u32 snapshot) + enum btree_iter_update_trigger_flags flags, + u32 snapshot) { struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|flags, k, ret) { + BTREE_ITER_slots|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return 0; + return k; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -186,20 +176,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; + return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); + int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 
+ if (ret) + return bkey_s_c_err(ret); + + return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); } static __always_inline int @@ -217,10 +210,10 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) if (!is_visible_key(desc, inum, k)) return 0; bch2_trans_iter_exit(trans, iter); @@ -242,7 +235,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_advance(&iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -259,25 +252,25 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags, - int update_flags) + enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter, slot = { NULL }; + struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -286,9 +279,8 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, continue; } - if (!slot.path && - !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) - bch2_trans_copy_iter(&slot, &iter); + if (!slot.path && !(flags & STR_HASH_must_replace)) + bch2_trans_copy_iter(&slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -298,47 +290,63 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, ret = -BCH_ERR_ENOSPC_str_hash_create; out: bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, &iter); - - return ret; + bch2_trans_iter_exit(trans, iter); + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; found: found = true; not_found: - - if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { + if (found && (flags & STR_HASH_must_create)) { + bch2_trans_iter_exit(trans, &slot); + return k; + } else if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { - ret = -EEXIST; } else { if (!found && slot.path) - swap(iter, slot); + swap(*iter, slot); - insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, update_flags); + insert->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, insert, flags); } goto out; } static __always_inline +int bch2_hash_set_in_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + enum btree_iter_update_trigger_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, + snapshot, insert, flags); + int ret = bkey_err(k); + if (ret) + return ret; + if (k.k) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_EEXIST_str_hash_set; + } + + return 0; +} + +static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, flags); } static __always_inline @@ -346,7 +354,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, - unsigned update_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i *delete; int ret; @@ -364,7 +372,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, update_flags); + return bch2_trans_update(trans, iter, delete, flags); } static __always_inline @@ -374,10 +382,9 @@ int bch2_hash_delete(struct btree_trans *trans, subvol_inum inum, const void *key) { struct btree_iter iter; - int ret; - - ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) return ret; @@ -386,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +struct snapshots_seen; +int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + +static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + if (hash_k.k->type != desc->key_type) + return 0; + + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); +} + #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7c67c28d3ef8..b7b96283c316 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -13,13 +13,26 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + trans, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + trans, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_nowarn_trans(trans, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + &inode); + if (!ret) { + if 
(fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + trans, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode.bi_snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto err; + } + } else if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(trans, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + ret = ret ?: -BCH_ERR_transaction_restart_nested; + goto err; + } + } else { + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -57,23 +136,25 @@ static int check_subvol(struct btree_trans *trans, "%s: snapshot tree %u not found", __func__, snapshot_tree); if (ret) - return ret; + goto err; if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - c, subvol_not_master_and_not_snapshot, + trans, subvol_not_master_and_not_snapshot, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ret = PTR_ERR_OR_ZERO(s); if (ret) - return ret; + goto err; SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - +err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); + printbuf_exit(&buf); return ret; } @@ -81,24 +162,68 @@ int bch2_check_subvols(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol(trans, &iter, k))); bch_err_fn(c, ret); return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + trans, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ -int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) +int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { + struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, - subvol_pos_bad, + bkey_gt(k.k->p, SUBVOL_POS_MAX), + c, subvol_pos_bad, "invalid pos"); + + 
bkey_fsck_err_on(!subvol.v->snapshot, + c, subvol_snapshot_bad, + "invalid snapshot"); + + bkey_fsck_err_on(!subvol.v->inode, + c, subvol_inode_bad, + "invalid inode"); fsck_err: return ret; } @@ -112,18 +237,60 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } +} + +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + enum btree_iter_update_trigger_flags flags) +{ + if (flags & BTREE_TRIGGER_transactional) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? 
-BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; } static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags, subvolume, s); + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); @@ -132,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); } int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) { struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); if (ret) return ret; @@ -152,8 +318,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_subvol_is_ro_trans(trans, subvol)); + return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); } int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, @@ -162,11 +327,11 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_snapshot snap; return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); } -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid) +int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid, bool warn) { struct btree_iter iter; struct bkey_s_c_subvolume subvol; @@ -174,10 +339,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + BTREE_ITER_cached|BTREE_ITER_with_updates, subvolume); ret = bkey_err(subvol); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + + bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, "missing subvolume %u", subvolid); if (likely(!ret)) @@ -186,6 +352,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, return ret; } +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid) +{ + return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true); +} + static int bch2_subvolume_reparent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -197,8 +369,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -206,7 +378,7 @@ static int bch2_subvolume_reparent(struct 
btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -223,13 +395,12 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d struct bch_subvolume s; return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_CACHED, &s)) ?: + bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -238,26 +409,61 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d */ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - u32 snapid; - int ret = 0; + struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; - subvol = bch2_bkey_get_iter_typed(trans, &iter, + struct bkey_s_c_subvolume subvol = + bch2_bkey_get_iter_typed(trans, &subvol_iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_INTENT, + BTREE_ITER_cached|BTREE_ITER_intent, subvolume); - ret = bkey_err(subvol); + int ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing subvolume %u", subvolid); if (ret) - return ret; + goto err; - snapid = le32_to_cpu(subvol.v->snapshot); + u32 snapid = le32_to_cpu(subvol.v->snapshot); + + struct bkey_s_c_snapshot snapshot = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, + BTREE_ID_snapshots, POS(0, snapid), + 0, snapshot); + ret = bkey_err(snapshot); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot %u", snapid); + if (ret) + goto err; + + u32 treeid = le32_to_cpu(snapshot.v->tree); + + struct bkey_s_c_snapshot_tree snapshot_tree = + bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, + BTREE_ID_snapshot_trees, POS(0, treeid), + 0, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) + goto err; + + if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { + struct bkey_i_snapshot_tree *snapshot_tree_mut = + bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, + &snapshot_tree.s_c, + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); + if (ret) + goto err; - ret = bch2_btree_delete_at(trans, &iter, 0) ?: + snapshot_tree_mut->v.master_subvol = 0; + } + + ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: bch2_snapshot_node_set_deleted(trans, snapid); - bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_iter_exit(trans, &snapshot_tree_iter); + bch2_trans_iter_exit(trans, &snapshot_iter); + bch2_trans_iter_exit(trans, &subvol_iter); return ret; } @@ -346,7 +552,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) n = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -360,6 +566,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 
parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -387,7 +594,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -416,12 +623,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); @@ -434,6 +642,78 @@ err: return ret; } +int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 
a6f56f66e27c..910f6196700e 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,33 +5,86 @@ #include "darray.h" #include "subvolume_types.h" -enum bkey_invalid_flags; - int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_invalid = bch2_subvolume_invalid, \ + .key_validate = bch2_subvolume_validate, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, int, struct bch_subvolume *); + bool, struct bch_subvolume *); +int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, + u32 *, bool); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); +static inline struct bkey_s_c +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, + u32 subvolid, unsigned flags) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + bch2_btree_iter_set_snapshot(iter, snapshot); + return bch2_btree_iter_peek_max_type(iter, end, flags); +} + +#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ +({ \ + struct bkey_s_c _k; \ + int _ret3 = 0; \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ +({ \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ +}) + int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); + +int bch2_initialize_subvolumes(struct bch_fs *); +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); int bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index af79134b07d6..e029df7ba89f 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -19,8 +19,8 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; - __le32 pad; + __le32 creation_parent; + __le32 fs_path_parent; bch_le128 otime; }; diff --git 
a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index ae644adfc391..1549d6daf7af 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -9,17 +9,19 @@ typedef DARRAY(u32) snapshot_id_list; #define IS_ANCESTOR_BITMAP 128 struct snapshot_t { + bool live; u32 parent; u32 skip[3]; u32 depth; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; - u32 equiv; unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; struct snapshot_table { + struct rcu_head rcu; + size_t nr; #ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); #else @@ -28,7 +30,8 @@ struct snapshot_table { }; typedef struct { - u32 subvol; + /* we can't have padding in this struct: */ + u64 subvol; u64 inum; } subvol_inum; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 36988add581f..a81a7b6c0989 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -8,7 +8,7 @@ #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "quota.h" #include "sb-clean.h" @@ -23,6 +23,7 @@ #include <linux/backing-dev.h> #include <linux/sort.h> +#include <linux/string_choices.h> static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; @@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { #undef x }; -void bch2_version_to_text(struct printbuf *out, unsigned v) +void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) { const char *str = "(unknown version)"; @@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); } -unsigned bch2_latest_compatible_version(unsigned v) +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) { if (!BCH_VERSION_MAJOR(v)) return v; @@ -68,6 +69,22 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } +bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +{ + bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed; + + if (ret) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -76,7 +93,7 @@ const char * const bch2_sb_fields[] = { }; static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - struct printbuf *); + enum bch_validate_flags, struct printbuf *); struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) @@ -142,8 +159,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev_handle)) - bdev_release(sb->bdev_handle); + if (!IS_ERR_OR_NULL(sb->s_bdev_file)) + bdev_fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); @@ -232,7 +249,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); return NULL; } } @@ -287,6 +304,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, 
struct printbuf *out return -BCH_ERR_invalid_sb_layout_nr_superblocks; } + if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { + prt_printf(out, "Invalid superblock layout: max_size_bits too high"); + return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; + } + max_sectors = 1 << layout->sb_max_size_bits; prev_offset = le64_to_cpu(layout->sb_offset[0]); @@ -344,8 +366,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, - int rw) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + enum bch_validate_flags flags, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; @@ -363,6 +385,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_features; } + if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || + BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { + prt_printf(out, "Filesystem has incompatible version"); + return -BCH_ERR_invalid_sb_features; + } + block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { @@ -401,7 +429,22 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_time_precision; } - if (rw == READ) { + /* old versions didn't know to downgrade this field */ + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); + + if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { + prt_printf(out, "Invalid version_incompat "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_str(out, " > incompat_allowed "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + if (flags & BCH_VALIDATE_write) + return -BCH_ERR_invalid_sb_version; + else + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); + } + + if (!flags) { /* * Been seeing a bug where these are getting inexplicably * zeroed, so we're now validating them, but we have to be @@ -414,8 +457,20 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && + !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) + SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) + SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; @@ -457,7 +512,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, out); + ret = bch2_sb_field_validate(sb, &mi->field, flags, out); if (ret) return ret; @@ -465,11 +520,19 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; - ret = bch2_sb_field_validate(sb, f, out); + ret = bch2_sb_field_validate(sb, f, flags, out); if (ret) return ret; } + if ((flags & BCH_VALIDATE_write) && + 
bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), + le64_to_cpu(sb->seq)); + return -BCH_ERR_invalid_sb_members_missing; + } + return 0; } @@ -499,6 +562,9 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); + c->sb.version_incompat_allowed + = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); c->sb.version_min = le16_to_cpu(src->version_min); c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; @@ -519,9 +585,11 @@ static void bch2_sb_update(struct bch_fs *c) memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) + if (ext) { le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); @@ -639,9 +707,10 @@ reread: bytes = vstruct_bytes(sb->sb); - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if (bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } @@ -653,7 +722,8 @@ reread: } enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR) { + if (csum_type >= BCH_CSUM_NR || + bch2_csum_type_is_encryption(csum_type)) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } @@ -690,8 +760,11 @@ retry: return -ENOMEM; sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) - return -ENOMEM; + if (!sb->sb_name) { + ret = -ENOMEM; + prt_printf(&err, "error allocating memory for sb_name"); + goto err; + } #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) @@ -704,22 +777,23 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev_handle) && - PTR_ERR(sb->bdev_handle) == -EACCES && + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->s_bdev_file) && + PTR_ERR(sb->s_bdev_file) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev_handle)) + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->s_bdev_file)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev_handle)) { - ret = PTR_ERR(sb->bdev_handle); + if (IS_ERR(sb->s_bdev_file)) { + ret = PTR_ERR(sb->s_bdev_file); + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } - sb->bdev = sb->bdev_handle->bdev; + sb->bdev = file_bdev(sb->s_bdev_file); ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -743,9 +817,9 @@ retry: prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); if (ret == 
-BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - printk(KERN_INFO "%s", err2.buf); + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); else - printk(KERN_ERR "%s", err2.buf); + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); printbuf_exit(&err2); printbuf_reset(&err); @@ -777,8 +851,10 @@ retry: i < layout.sb_offset + layout.nr_superblocks; i++) { offset = le64_to_cpu(*i); - if (offset == opt_get(*opts, sb)) + if (offset == opt_get(*opts, sb)) { + ret = -BCH_ERR_invalid; continue; + } ret = read_one_super(sb, offset, &err); if (!ret) @@ -803,21 +879,20 @@ got_super: goto err; } - ret = 0; sb->have_layout = true; - ret = bch2_sb_validate(sb, &err, READ); + ret = bch2_sb_validate(sb, 0, &err); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); goto err_no_print; } out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; @@ -850,7 +925,7 @@ static void write_super_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "superblock %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -863,14 +938,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; + memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], - bio_sectors(bio)); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); @@ -910,6 +986,7 @@ int bch2_write_super(struct bch_fs *c) struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + DARRAY(struct bch_dev *) online_devices = {}; int ret = 0; trace_and_count(c, write_super, c, _RET_IP_); @@ -922,6 +999,15 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + for_each_online_member(c, ca) { + ret = darray_push(&online_devices, ca); + if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { + percpu_ref_put(&ca->io_ref); + goto out; + } + percpu_ref_get(&ca->io_ref); + } + /* Make sure we're using the new magic numbers: */ c->disk_sb.sb->magic = BCHFS_MAGIC; c->disk_sb.sb->layout.magic = BCHFS_MAGIC; @@ -929,8 +1015,8 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - for_each_online_member(c, ca) - __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + darray_for_each(online_devices, ca) + __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq; c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); if (test_bit(BCH_FS_error, 
&c->flags)) @@ -946,16 +1032,15 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_errors_from_cpu(c); bch2_sb_downgrade_update(c); - for_each_online_member(c, ca) - bch2_sb_from_fs(c, ca); + darray_for_each(online_devices, ca) + bch2_sb_from_fs(c, (*ca)); - for_each_online_member(c, ca) { + darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - percpu_ref_put(&ca->io_ref); goto out; } } @@ -977,58 +1062,79 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, " > "); bch2_version_to_text(&buf, bcachefs_metadata_version_current); prt_str(&buf, ")"); - bch2_fs_fatal_error(c, "%s", buf.buf); + bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); return -BCH_ERR_sb_not_downgraded; } - for_each_online_member(c, ca) { - __set_bit(ca->dev_idx, sb_written.d); - ca->sb_write_error = 0; + darray_for_each(online_devices, ca) { + __set_bit((*ca)->dev_idx, sb_written.d); + (*ca)->sb_write_error = 0; } - for_each_online_member(c, ca) - read_back_super(c, ca); + darray_for_each(online_devices, ca) + read_back_super(c, *ca); closure_sync(cl); - for_each_online_member(c, ca) { + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; + if (ca->sb_write_error) continue; if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - bch2_fs_fatal_error(c, - "Superblock write was silently dropped! (seq %llu expected %llu)", + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, + ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); - ret = -BCH_ERR_erofs_sb_err; - goto out; + + if (c->opts.errors != BCH_ON_ERROR_continue && + c->opts.errors != BCH_ON_ERROR_fix_safe) { + ret = -BCH_ERR_erofs_sb_err; + bch2_fs_fatal_error(c, "%s", buf.buf); + } else { + bch_err(c, "%s", buf.buf); + } + + printbuf_exit(&buf); } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - bch2_fs_fatal_error(c, - "Superblock modified by another process (seq %llu expected %llu)", + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, + ": Superblock modified by another process (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } } + if (ret) + goto out; + do { wrote = false; - for_each_online_member(c, ca) + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; if (!ca->sb_write_error && sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); wrote = true; } + } closure_sync(cl); sb++; } while (wrote); - for_each_online_member(c, ca) { + darray_for_each(online_devices, cap) { + struct bch_dev *ca = *cap; if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); else @@ -1058,12 +1164,15 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, - "Unable to write superblock to sufficient devices (from %ps)", + ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) ret = -1; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + 
darray_for_each(online_devices, ca) + percpu_ref_put(&(*ca)->io_ref); + darray_exit(&online_devices); printbuf_exit(&err); return ret; } @@ -1093,23 +1202,19 @@ bool bch2_check_version_downgrade(struct bch_fs *c) * c->sb will be checked before we write the superblock, so update it as * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - c->sb.version_upgrade_complete = bcachefs_metadata_version_current; - } - if (c->sb.version > bcachefs_metadata_version_current) { + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); + if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version = bcachefs_metadata_version_current; - } - if (c->sb.version_min > bcachefs_metadata_version_current) { + if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version_min = bcachefs_metadata_version_current; - } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) { lockdep_assert_held(&c->sb_lock); @@ -1119,10 +1224,16 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + + if (incompat) { + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); + } } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { if (vstruct_bytes(f) < 88) { prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); @@ -1137,8 +1248,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_ext *e = field_to_type(f, ext); - prt_printf(out, "Recovery passes required:"); - prt_tab(out); + prt_printf(out, "Recovery passes required:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); prt_newline(out); @@ -1147,13 +1257,17 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, if (errors_silent) { le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - prt_printf(out, "Errors to silently fix:"); - prt_tab(out); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); + prt_printf(out, "Errors to silently fix:\t"); + prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, + min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); prt_newline(out); kfree(errors_silent); } + + prt_printf(out, "Btrees with missing data:\t"); + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_ext = { @@ -1178,14 +1292,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) } 
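The bch2_write_super() rework above snapshots the set of online devices into a darray once, takes an io_ref on each, runs every validate/read-back/write pass over that stable list, and drops all the references in one place at the end. A minimal userspace sketch of that "collect references up front, release once at the end" pattern, with a plain dynamic array and an integer refcount standing in for darray and percpu_ref (all names here are illustrative, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct dev {
	int idx;
	int refcount;		/* stand-in for the io_ref percpu_ref */
	int online;
};

struct dev_list {
	struct dev **data;
	size_t nr, capacity;
};

static int dev_list_push(struct dev_list *l, struct dev *d)
{
	if (l->nr == l->capacity) {
		size_t new_cap = l->capacity ? l->capacity * 2 : 8;
		struct dev **p = realloc(l->data, new_cap * sizeof(*p));
		if (!p)
			return -1;
		l->data = p;
		l->capacity = new_cap;
	}
	l->data[l->nr++] = d;
	return 0;
}

static int write_super(struct dev *devs, size_t nr_devs)
{
	struct dev_list online = { 0 };
	int ret = 0;

	/* Take a reference on every online device before doing any work: */
	for (size_t i = 0; i < nr_devs; i++) {
		if (!devs[i].online)
			continue;
		if (dev_list_push(&online, &devs[i])) {
			ret = -1;
			goto out;
		}
		devs[i].refcount++;
	}

	/* Multiple passes over the same stable list: */
	for (size_t i = 0; i < online.nr; i++)
		printf("validating sb on dev %d\n", online.data[i]->idx);
	for (size_t i = 0; i < online.nr; i++)
		printf("writing sb to dev %d\n", online.data[i]->idx);
out:
	/* Drop exactly the references we took, exactly once: */
	for (size_t i = 0; i < online.nr; i++)
		online.data[i]->refcount--;
	free(online.data);
	return ret;
}

int main(void)
{
	struct dev devs[] = { { .idx = 0, .online = 1 }, { .idx = 1 }, { .idx = 2, .online = 1 } };
	return write_super(devs, 3) ? 1 : 0;
}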
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int ret; - ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", bch2_sb_fields[type], field_err.buf); @@ -1252,104 +1366,86 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - u64 fields_have = 0; - unsigned nr_devices = 0; - if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_dev_exists(sb, i); - - prt_printf(out, "External UUID:"); - prt_tab(out); + prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); - prt_printf(out, "Internal UUID:"); - prt_tab(out); + prt_printf(out, "Internal UUID:\t"); pr_uuid(out, sb->uuid.b); prt_newline(out); - prt_printf(out, "Magic number:"); - prt_tab(out); + prt_printf(out, "Magic number:\t"); pr_uuid(out, sb->magic.b); prt_newline(out); - prt_str(out, "Device index:"); - prt_tab(out); - prt_printf(out, "%u", sb->dev_idx); - prt_newline(out); + prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:"); - prt_tab(out); - prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); + prt_printf(out, "Label:\t"); + if (!strlen(sb->label)) + prt_printf(out, "(none)"); + else + prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:"); - prt_tab(out); + prt_printf(out, "Version:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:"); - prt_tab(out); + prt_printf(out, "Incompatible features allowed:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + prt_newline(out); + + prt_printf(out, "Incompatible features in use:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_newline(out); + + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); - prt_printf(out, "Oldest version on disk:"); - prt_tab(out); + prt_printf(out, "Oldest version on disk:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); - prt_printf(out, "Created:"); - prt_tab(out); + prt_printf(out, "Created:\t"); if (sb->time_base_lo) bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); - prt_printf(out, "Sequence number:"); - prt_tab(out); + prt_printf(out, "Sequence number:\t"); prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); - prt_printf(out, "Time of last write:"); - prt_tab(out); + prt_printf(out, "Time of last write:\t"); bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); prt_newline(out); - prt_printf(out, "Superblock size:"); - prt_tab(out); + prt_printf(out, "Superblock size:\t"); prt_units_u64(out, vstruct_bytes(sb)); prt_str(out, "/"); prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); - prt_printf(out, "Clean:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); - prt_newline(out); + prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); 
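The bch2_sb_to_text() hunk below folds the separate prt_tab() calls into "\t" escapes inside the format strings; the printbuf code expands "\t" to the tabstop pushed earlier (44 columns here), so every value lines up in one column. A rough userspace approximation of that aligned key/value layout, using a fixed field width in place of printbuf tabstops (printbuf itself is bcachefs-specific; the values printed are placeholders):

#include <stdio.h>

/* One tabstop at column 44, approximated with a left-justified field width: */
#define TABSTOP 44

static void prt_field(const char *name, const char *val)
{
	printf("%-*s%s\n", TABSTOP, name, val);
}

int main(void)
{
	prt_field("External UUID:", "(example uuid)");
	prt_field("Version:", "(example version)");
	prt_field("Clean:", "1");
	return 0;
}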
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); - prt_printf(out, "Devices:"); - prt_tab(out); - prt_printf(out, "%u", nr_devices); - prt_newline(out); - - prt_printf(out, "Sections:"); + prt_printf(out, "Sections:\t"); + u64 fields_have = 0; vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); - prt_tab(out); prt_bitflags(out, bch2_sb_fields, fields_have); prt_newline(out); - prt_printf(out, "Features:"); - prt_tab(out); + prt_printf(out, "Features:\t"); prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); prt_newline(out); - prt_printf(out, "Compat features:"); - prt_tab(out); + prt_printf(out, "Compat features:\t"); prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); prt_newline(out); @@ -1366,8 +1462,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, id); - prt_printf(out, "%s:", opt->attr.name); - prt_tab(out); + prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 95e80e06316b..b4cff9ebdebb 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -10,14 +10,26 @@ #include <asm/byteorder.h> +#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 + static inline bool bch2_version_compatible(u16 version) { return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && version >= bcachefs_metadata_version_min; } -void bch2_version_to_text(struct printbuf *, unsigned); -unsigned bch2_latest_compatible_version(unsigned); +void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); + +bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); + +static inline bool bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) +{ + return likely(version <= c->sb.version_incompat) + ? 
true + : bch2_set_version_incompat(c, version); +} static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { @@ -51,7 +63,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, + enum bch_validate_flags, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; @@ -91,7 +104,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) } bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned); +void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b23e11825e6..0459c875e189 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -15,6 +15,7 @@ #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" @@ -24,6 +25,7 @@ #include "clock.h" #include "compress.h" #include "debug.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -56,6 +58,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> @@ -86,26 +89,51 @@ const char * const bch2_fs_flag_strs[] = { NULL }; -void __bch2_print(struct bch_fs *c, const char *fmt, ...) +void bch2_print_str(struct bch_fs *c, const char *str) { +#ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - va_list args; - va_start(args, fmt); - if (likely(!stdio)) { - vprintk(fmt, args); - } else { - unsigned long flags; + if (unlikely(stdio)) { + bch2_stdio_redirect_printf(stdio, true, "%s", str); + return; + } +#endif + bch2_print_string_as_lines(KERN_ERR, str); +} +__printf(2, 0) +static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) +{ +#ifdef __KERNEL__ + if (unlikely(stdio)) { if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + return; } +#endif + vprintk(fmt, args); +} + +void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) +{ + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; + + va_list args; + va_start(args, fmt); + bch2_print_maybe_redirect(stdio, fmt, args); + va_end(args); +} + +void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
+{ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + va_list args; + va_start(args, fmt); + bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } @@ -156,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); +static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); @@ -208,22 +237,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) return c; } -static void bch2_dev_usage_journal_reserve(struct bch_fs *c) -{ - unsigned nr = 0, u64s = - ((sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / - sizeof(u64); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) - nr++; - rcu_read_unlock(); - - bch2_journal_entry_res_resize(&c->journal, - &c->dev_usage_journal_res, u64s * nr); -} - /* Filesystem RO/RW: */ /* @@ -250,7 +263,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); - bch2_gc_thread_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", @@ -260,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes++; if (bch2_btree_interior_updates_flush(c) || + bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { @@ -271,11 +284,16 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); - if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); + bch2_fs_journal_stop(&c->journal); + bch_info(c, "%sclean shutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? 
"" : "un", + c->journal.seq_ondisk); + /* * After stopping journal: */ @@ -352,12 +370,13 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - !c->opts.norecovery) { + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); + bch2_verify_accounting_clean(c); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -392,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; @@ -422,6 +452,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; @@ -448,7 +480,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ - set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + set_bit(JOURNAL_need_flush_write, &c->journal.flags); + set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); @@ -466,12 +499,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) } #endif - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; @@ -497,7 +524,8 @@ err: int bch2_fs_read_write(struct bch_fs *c) { - if (c->opts.norecovery) + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) @@ -517,12 +545,12 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < BCH_TIME_STAT_NR; i++) + for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); + bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -530,6 +558,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); @@ -537,22 +566,29 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); + bch2_fs_btree_iter_exit(c); bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); 
bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); + bch2_fs_btree_gc_exit(c); bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - free_percpu(c->online_reserved); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); + WARN(v, "online_reserved not 0 at shutdown: %lli", v); + free_percpu(c->online_reserved); + } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); + free_percpu(c->usage); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -562,12 +598,13 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); - if (c->io_complete_wq) - destroy_workqueue(c->io_complete_wq); + if (c->btree_write_submit_wq) + destroy_workqueue(c->btree_write_submit_wq); + if (c->btree_read_complete_wq) + destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -576,7 +613,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -593,16 +630,12 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_stopping, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); for_each_member_device(c, ca) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -642,6 +675,7 @@ void bch2_fs_free(struct bch_fs *c) struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { + EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -696,7 +730,7 @@ static int bch2_fs_online(struct bch_fs *c) ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); goto err; } } @@ -715,7 +749,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -745,13 +779,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); + spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); - atomic_set(&c->journal_keys.ref, 1); - c->journal_keys.initial_ref_held = true; - for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -759,6 +789,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); @@ -769,24 +800,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb 
*sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); - mutex_init(&c->usage_scratch_lock); - mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); init_rwsem(&c->snapshot_create_lock); spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); - seqcount_init(&c->gc_pos_lock); - seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); @@ -794,10 +818,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); - c->copy_gc_enabled = 1; - c->rebalance.enabled = 1; - c->promote_whole_extents = true; - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; @@ -818,13 +838,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -859,16 +879,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -881,12 +901,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } @@ -896,11 +915,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 
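The __bch2_fs_free() change above sums c->online_reserved before freeing it and WARNs if the reservations were not all released, i.e. "shard the counter per CPU, sum it at teardown, expect zero". The same check in plain C, with a fixed array of shards standing in for a percpu counter (names are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NR_SHARDS 4

/* Stand-in for a percpu counter: one shard per CPU, updated locally. */
static int64_t online_reserved[NR_SHARDS];

static void reserve(unsigned shard, int64_t sectors)
{
	online_reserved[shard] += sectors;
}

static void release(unsigned shard, int64_t sectors)
{
	online_reserved[shard] -= sectors;
}

static int64_t counter_sum(void)
{
	int64_t sum = 0;

	for (unsigned i = 0; i < NR_SHARDS; i++)
		sum += online_reserved[i];
	return sum;
}

int main(void)
{
	reserve(0, 128);
	release(3, 128);	/* a release may land on a different shard */
	reserve(1, 8);		/* deliberately leaked */

	int64_t v = counter_sum();
	if (v)
		fprintf(stderr, "online_reserved not 0 at shutdown: %lld\n", (long long) v);
	return 0;
}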
bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: - bch2_fs_replicas_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: @@ -910,23 +929,24 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: + bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); if (ret) goto err; - for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, i) && - bch2_dev_alloc(c, i)) { - ret = -EEXIST; + for (i = 0; i < c->sb.nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + ret = bch2_dev_alloc(c, i); + if (ret) goto err; - } + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_dev_usage_journal_reserve(c); bch2_journal_entry_res_resize(&c->journal, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); @@ -952,7 +972,7 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; - prt_str(&p, "mounting version "); + prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { @@ -1002,15 +1022,26 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); + if (!ext) { + bch_err(c, "insufficient space in superblock for sb_field_ext"); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? 
bch2_fs_recovery(c) : bch2_fs_initialize(c); + c->recovery_task = NULL; + if (ret) goto err; @@ -1061,7 +1092,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) } static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb) + struct bch_sb_handle *sb, + struct bch_opts *opts) { if (fs == sb) return 0; @@ -1069,7 +1101,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) + if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) @@ -1094,19 +1126,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); prt_newline(&buf); - prt_printf(&buf, "Not using older sb"); + if (!opts->no_splitbrain_check) + prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); @@ -1124,17 +1159,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); + + if (!opts->no_splitbrain_check) { + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + } pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } return 0; @@ -1153,26 +1193,26 @@ static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); - bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); - free_page((unsigned long) ca->sb_read_scratch); + kfree(ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); +#endif kobject_put(&ca->kobj); } @@ -1190,21 +1230,20 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); - if (ca->kobj.state_in_sysfs) { - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } + bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); } +#ifndef CONFIG_BCACHEFS_DEBUG static void 
bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } +#endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { @@ -1213,6 +1252,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } +static void bch2_dev_unlink(struct bch_dev *ca) +{ + struct kobject *b; + + /* + * This is racy w.r.t. the underlying block device being hot-removed, + * which removes it from sysfs. + * + * It'd be lovely if we had a way to handle this race, but the sysfs + * code doesn't appear to provide a good method and block/holder.c is + * susceptible as well: + */ + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev && + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { + sysfs_remove_link(b, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } +} + static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; @@ -1256,12 +1315,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - init_rwsem(&ca->bucket_lock); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); @@ -1273,14 +1330,19 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, - 0, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +#ifndef CONFIG_BCACHEFS_DEBUG + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) + goto err; +#else + atomic_long_set(&ca->ref, 1); +#endif + + bch2_dev_allocator_background_init(ca); + + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || - bioset_init(&ca->replica_set, 4, - offsetof(struct bch_write_bio, bio), 0) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; @@ -1308,7 +1370,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; - int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1320,10 +1381,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); - return ret; + return 0; err: - if (ca) - bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } @@ -1371,10 +1430,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); + BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - ca = bch_dev_locked(c, sb->sb->dev_idx); + ca = bch2_dev_locked(c, sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) @@ -1466,10 +1524,10 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, i)) + if (!bch2_member_exists(c->disk_sb.sb, i)) continue; - ca = bch_dev_locked(c, i); 
+ ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || @@ -1490,6 +1548,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } @@ -1501,6 +1560,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1548,32 +1608,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ -static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_NORUN, NULL); - bch_err_msg(c, ret, "removing dev alloc info"); - return ret; -} - int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; @@ -1586,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1597,27 +1631,37 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; + /* + * We need to flush the entire journal to get rid of keys that reference + * the device being removed before removing the superblock entry + */ + bch2_journal_flush_all_pins(&c->journal); + + /* + * this is really just needed for the bch2_replicas_gc_(start|end) + * calls, and could be cleaned up: + */ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); - bch_err(ca, "journal error"); + bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; @@ -1638,23 +1682,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); +#else + ca->dying 
= true; + bch2_dev_put(ca); +#endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); /* - * At this point the device object has been removed in-core, but the - * on-disk journal might still refer to the device index via sb device - * usage entries. Recovery fails if it sees usage information for an - * invalid device. Flush journal pins to push the back of the journal - * past now invalid device index references before we update the - * superblock, but after the device object has been removed so any - * further journal writes elide usage info for the device. - */ - bch2_journal_flush_all_pins(&c->journal); - - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: */ @@ -1666,8 +1704,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - - bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && @@ -1683,9 +1719,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; - struct bch_sb_field_members_v2 *mi; - struct bch_member dev_mi; - unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; int ret; @@ -1695,7 +1728,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); + struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); @@ -1715,17 +1748,10 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } - bch2_dev_usage_init(ca); - ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) goto err; - ret = bch2_dev_journal_alloc(ca); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1735,36 +1761,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; if (dynamic_fault("bcachefs:add:no_slot")) - goto no_slot; - - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) - goto have_slot; -no_slot: - ret = -BCH_ERR_ENOSPC_sb_members; - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + goto err_unlock; - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) { - ret = -BCH_ERR_ENOSPC_sb_members; + ret = bch2_sb_member_alloc(c); + if (ret < 0) { bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + unsigned dev_idx = ret; /* success: */ - *m = dev_mi; - m->last_mount = cpu_to_le64(ktime_get_real_seconds()); - c->disk_sb.sb->nr_devices = nr_devices; + dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); + *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); @@ -1779,9 +1788,11 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_dev_usage_journal_reserve(c); + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; - ret = bch2_trans_mark_dev_sb(c, ca); + ret = 
bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; @@ -1791,13 +1802,20 @@ have_slot: if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); - return 0; +out: + printbuf_exit(&label); + printbuf_exit(&errbuf); + bch_err_fn(c, ret); + return ret; err_unlock: mutex_unlock(&c->sb_lock); @@ -1806,10 +1824,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; + goto out; err_late: up_write(&c->state_lock); ca = NULL; @@ -1835,7 +1850,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(&c->disk_sb, &sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -1844,9 +1859,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (ret) goto err; - ca = bch_dev_locked(c, dev_idx); + ca = bch2_dev_locked(c, dev_idx); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err; @@ -1862,7 +1877,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; @@ -1919,6 +1934,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) goto err; } + if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { + bch_err(ca, "New device size too big (%llu greater than max %u)", + nbuckets, BCH_MEMBER_NBUCKETS_MAX); + ret = -BCH_ERR_device_size_too_big; + goto err; + } + if (bch2_dev_is_online(ca) && get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { @@ -1932,7 +1954,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (ret) goto err; - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; @@ -1944,15 +1966,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; + + ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; - - /* - * XXX: this is all wrong transactionally - we'll be able to do - * this correctly after the disk space accounting rewrite - */ - ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; } bch2_recalc_capacity(c); @@ -1964,13 +1989,12 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) - if (!strcmp(name, ca->name)) { - rcu_read_unlock(); + if (!strncmp(name, "/dev/", strlen("/dev/"))) + name += strlen("/dev/"); + + 
for_each_member_device(c, ca) + if (!strcmp(name, ca->name)) return ca; - } - rcu_read_unlock(); return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } @@ -2023,7 +2047,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb); + ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dada09331d2e..04f8287eff5c 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -29,21 +29,12 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); -/* - * Only for use in the recovery/fsck path: - */ -static inline void bch2_fs_lazy_rw(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags) && - !test_bit(BCH_FS_was_rw, &c->flags)) - bch2_fs_read_write_early(c); -} - void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 0e5a14fc8e7f..368a63d938cf 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,7 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; - struct bdev_handle *bdev_handle; + struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; @@ -26,17 +26,4 @@ struct bch_devs_list { u8 data[BCH_BKEY_PTRS_MAX]; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 valid; -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cee80c47feea..a7eb1f511484 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -22,10 +22,12 @@ #include "buckets.h" #include "clock.h" #include "compress.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "movinggc.h" @@ -139,10 +141,12 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); -write_attribute(prune_cache); -write_attribute(btree_wakeup); -rw_attribute(btree_gc_periodic); -rw_attribute(gc_gens_pos); +write_attribute(trigger_journal_flush); +write_attribute(trigger_journal_writes); +write_attribute(trigger_btree_cache_shrink); +write_attribute(trigger_btree_key_cache_shrink); +write_attribute(trigger_freelist_wakeup); +read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); @@ -166,9 +170,9 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); +read_attribute(btree_reserve_cache); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); @@ -189,12 +193,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) { bch2_printbuf_tabstop_push(out, 24); - for 
(unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { - prt_str(out, bch2_write_refs[i]); - prt_tab(out); - prt_printf(out, "%li", atomic_long_read(&c->writes[i])); - prt_newline(out); - } + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) + prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); } #endif @@ -203,21 +203,20 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); +read_attribute(usage_base); #define x(t, n, ...) read_attribute(t); BCH_PERSISTENT_COUNTERS() #undef x rw_attribute(discard); +read_attribute(state); rw_attribute(label); -rw_attribute(copy_gc_enabled); read_attribute(copy_gc_wait); -rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); -rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -232,149 +231,88 @@ write_attribute(perf_test); #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0444 }; + { .name = #_name, .mode = 0644 }; BCH_TIME_STATS() #undef x -static struct attribute sysfs_state_rw = { - .name = "state", - .mode = 0444, -}; - static size_t bch2_btree_cache_size(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; size_t ret = 0; struct btree *b; - mutex_lock(&c->btree_cache.lock); - list_for_each_entry(b, &c->btree_cache.live, list) + mutex_lock(&bc->lock); + list_for_each_entry(b, &bc->live[0].list, list) ret += btree_buf_bytes(b); - - mutex_unlock(&c->btree_cache.lock); + list_for_each_entry(b, &bc->live[1].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->freeable, list) + ret += btree_buf_bytes(b); + mutex_unlock(&bc->lock); return ret; } static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans *trans; - enum btree_id id; - struct compression_type_stats { - u64 nr_extents; - u64 sectors_compressed; - u64 sectors_uncompressed; - } s[BCH_COMPRESSION_TYPE_NR]; - u64 compressed_incompressible = 0; - int ret = 0; - - memset(s, 0, sizeof(s)); - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - trans = bch2_trans_get(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) - continue; - - ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *entry; - bool compressed = false, incompressible = false; - - bkey_for_each_crc(k.k, ptrs, crc, entry) { - incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; - compressed |= crc_is_compressed(crc); - - if (crc_is_compressed(crc)) { - s[crc.compression_type].nr_extents++; - s[crc.compression_type].sectors_compressed += crc.compressed_size; - s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; - } - } - - compressed_incompressible += compressed && incompressible; - - if (!compressed) { - unsigned t = incompressible ? 
BCH_COMPRESSION_TYPE_incompressible : 0; - - s[t].nr_extents++; - s[t].sectors_compressed += k.k->size; - s[t].sectors_uncompressed += k.k->size; - } - 0; - })); - } - - bch2_trans_put(trans); - - if (ret) - return ret; - prt_str(out, "type"); printbuf_tabstop_push(out, 12); - prt_tab(out); - - prt_str(out, "compressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "uncompressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "average extent size"); printbuf_tabstop_push(out, 24); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); + + for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { + struct disk_accounting_pos a = { + .type = BCH_DISK_ACCOUNTING_compression, + .compression.type = i, + }; + struct bpos p = disk_accounting_pos_to_bpos(&a); + u64 v[3]; + bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); + + u64 nr_extents = v[0]; + u64 sectors_uncompressed = v[1]; + u64 sectors_compressed = v[2]; - for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { bch2_prt_compression_type(out, i); prt_tab(out); - prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_human_readable_u64(out, sectors_compressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_human_readable_u64(out, sectors_uncompressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].nr_extents - ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + prt_human_readable_u64(out, nr_extents + ? div64_u64(sectors_uncompressed << 9, nr_extents) : 0); prt_tab_rjust(out); prt_newline(out); } - if (compressed_incompressible) { - prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); - prt_newline(out); - } - return 0; } static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_btree_id_to_text(out, c->gc_gens_btree); + prt_printf(out, ": "); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } -static void bch2_btree_wakeup_all(struct bch_fs *c) +static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans *trans; - - seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + struct bch_fs_usage_base b = {}; - if (b) - six_lock_wakeup_all(&b->lock); + acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); - } - seqmutex_unlock(&c->btree_trans_lock); + prt_printf(out, "hidden:\t\t%llu\n", b.hidden); + prt_printf(out, "btree:\t\t%llu\n", b.btree); + prt_printf(out, "data:\t\t%llu\n", b.data); + prt_printf(out, "cached:\t%llu\n", b.cached); + prt_printf(out, "reserved:\t\t%llu\n", b.reserved); + prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); } SHOW(bch2_fs) @@ -392,14 +330,9 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_write_stats) bch2_btree_write_stats_to_text(out, c); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - - sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ if (attr == &sysfs_copy_gc_wait) @@ -408,27 +341,25 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); - 
sysfs_print(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) - bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, c); + bch2_btree_cache_to_text(out, &c->btree_cache); if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + if (attr == &sysfs_btree_reserve_cache) + bch2_btree_reserve_cache_to_text(out, c); + if (attr == &sysfs_stripes_heap) bch2_stripes_heap_to_text(out, c); if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c); + bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); @@ -462,6 +393,12 @@ SHOW(bch2_fs) if (attr == &sysfs_disk_groups) bch2_disk_groups_to_text(out, c); + if (attr == &sysfs_alloc_debug) + bch2_fs_alloc_debug_to_text(out, c); + + if (attr == &sysfs_usage_base) + bch2_fs_usage_base_to_text(out, c); + return 0; } @@ -469,35 +406,8 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_btree_gc_periodic) { - ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) - ?: (ssize_t) size; - - wake_up_process(c->gc_thread); - return ret; - } - - if (attr == &sysfs_copy_gc_enabled) { - ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) - ?: (ssize_t) size; - - if (c->copygc_thread) - wake_up_process(c->copygc_thread); - return ret; - } - - if (attr == &sysfs_rebalance_enabled) { - ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) - ?: (ssize_t) size; - - rebalance_wakeup(c); - return ret; - } - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (!test_bit(BCH_FS_started, &c->flags)) @@ -505,39 +415,46 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_rw, &c->flags)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; - if (attr == &sysfs_prune_cache) { + if (attr == &sysfs_trigger_btree_cache_shrink) { + struct btree_cache *bc = &c->btree_cache; struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); } - if (attr == &sysfs_btree_wakeup) - bch2_btree_wakeup_all(c); - - if (attr == &sysfs_trigger_gc) { - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - down_read(&c->state_lock); - bch2_gc(c, false, false); - up_read(&c->state_lock); -#else - bch2_gc_gens(c); -#endif + if (attr == &sysfs_trigger_btree_key_cache_shrink) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } + if (attr == &sysfs_trigger_gc) + bch2_gc_gens(c); + if (attr == &sysfs_trigger_discards) bch2_do_discards(c); if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + + if (attr == &sysfs_trigger_journal_writes) + bch2_journal_do_writes(&c->journal); + + if (attr == &sysfs_trigger_freelist_wakeup) + closure_wake_up(&c->freelist_wait); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -558,6 +475,7 @@ STORE(bch2_fs) size = ret; 
} #endif + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -567,7 +485,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_cache_size, &sysfs_btree_write_stats, - &sysfs_promote_whole_extents, + &sysfs_rebalance_status, &sysfs_compression_stats, @@ -587,17 +505,22 @@ SHOW(bch2_fs_counters) printbuf_tabstop_push(out, 32); - #define x(t, ...) \ + #define x(t, n, f, ...) \ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - prt_printf(out, "since mount:"); \ - prt_tab(out); \ + if (f & TYPE_SECTORS) { \ + counter <<= 9; \ + counter_since_mount <<= 9; \ + } \ + \ + prt_printf(out, "since mount:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ - prt_printf(out, "since filesystem creation:"); \ - prt_tab(out); \ + prt_printf(out, "since filesystem creation:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -639,9 +562,9 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_flags, &sysfs_journal_debug, - &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, + &sysfs_btree_reserve_cache, &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, @@ -657,16 +580,16 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, - &sysfs_prune_cache, - &sysfs_btree_wakeup, + &sysfs_trigger_journal_flush, + &sysfs_trigger_journal_writes, + &sysfs_trigger_btree_cache_shrink, + &sysfs_trigger_btree_key_cache_shrink, + &sysfs_trigger_freelist_wakeup, &sysfs_gc_gens_pos, - &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, - &sysfs_rebalance_enabled, - &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -674,6 +597,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_internal_uuid, &sysfs_disk_groups, + &sysfs_alloc_debug, + &sysfs_usage_base, NULL }; @@ -723,7 +648,7 @@ STORE(bch2_fs_opts_dir) if (ret < 0) goto err; - bch2_opt_set_sb(c, opt, v); + bch2_opt_set_sb(c, NULL, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && @@ -732,6 +657,13 @@ STORE(bch2_fs_opts_dir) (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); + if (v && id == Opt_rebalance_enabled) + rebalance_wakeup(c); + + if (v && id == Opt_copygc_enabled && + c->copygc_thread) + wake_up_process(c->copygc_thread); + ret = size; err: bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); @@ -777,6 +709,13 @@ SHOW(bch2_fs_time_stats) STORE(bch2_fs_time_stats) { + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_reset(&c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x return size; } SYSFS_OPS(bch2_fs_time_stats); @@ -789,88 +728,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - unsigned i, nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstop_push(out, 8); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - 
printbuf_tabstop_push(out, 16); - - bch2_dev_usage_to_text(out, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:"); - prt_newline(out); - for (i = 0; i < BCH_WATERMARK_NR; i++) { - prt_str(out, bch2_watermarks[i]); - prt_tab(out); - prt_u64(out, bch2_dev_buckets_reserved(ca, i)); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_str(out, "freelist_wait"); - prt_tab(out); - prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open buckets allocated"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_newline(out); - - prt_str(out, "open buckets this dev"); - prt_tab(out); - prt_u64(out, ca->nr_open_buckets); - prt_newline(out); - - prt_str(out, "open buckets total"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT); - prt_newline(out); - - prt_str(out, "open_buckets_wait"); - prt_tab(out); - prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open_buckets_btree"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_btree]); - prt_newline(out); - - prt_str(out, "open_buckets_user"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_user]); - prt_newline(out); - - prt_str(out, "buckets_to_invalidate"); - prt_tab(out); - prt_u64(out, should_invalidate_buckets(ca, stats)); - prt_newline(out); - - prt_str(out, "btree reserve cache"); - prt_tab(out); - prt_u64(out, c->btree_reserve_cache_nr); - prt_newline(out); -} - static const char * const bch2_rw[] = { "read", "write", @@ -915,7 +772,7 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state_rw) { + if (attr == &sysfs_state) { prt_string_option(out, bch2_member_states, ca->mi.state); prt_char(out, '\n'); } @@ -930,17 +787,20 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); if (attr == &sysfs_alloc_debug) - dev_alloc_debug_to_text(out, ca); + bch2_dev_alloc_debug_to_text(out, ca); + + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); return 0; } @@ -949,32 +809,17 @@ STORE(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct bch_member *mi; if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v != BCH_MEMBER_DISCARD(mi)) { - SET_BCH_MEMBER_DISCARD(mi, v); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); } if (attr == &sysfs_durability) { u64 v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { - SET_BCH_MEMBER_DURABILITY(mi, v + 1); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); } if (attr == &sysfs_label) { @@ -1007,7 +852,7 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, - &sysfs_state_rw, + 
&sysfs_state, &sysfs_label, &sysfs_has_data, @@ -1023,6 +868,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_open_buckets, NULL }; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b3fe9fc57747..6c6469814637 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -121,7 +121,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ck.k.p.offset = i; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ck.k.p.offset = i * 2; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -259,9 +259,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i >= nr * 2) break; @@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, 
BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -320,9 +320,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i == nr) break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -394,9 +394,9 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; - k.k_i.k.version.lo = test_version++; + k.k_i.k.bversion.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } @@ -450,9 +450,9 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.p.snapshot = snapid; k.k_i.k.size = len; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); return ret; } @@ -481,14 +481,14 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -506,11 +506,11 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, @@ -671,8 
+671,8 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + BTREE_ITER_intent); + k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; @@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + BTREE_ITER_slots|BTREE_ITER_intent, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; @@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { return bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; @@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, unsigned i; u64 time; + if (nr == 0 || nr_threads == 0) { + pr_err("nr of iterations or threads is not allowed to be 0"); + return -EINVAL; + } + atomic_set(&j.ready, nr_threads); init_waitqueue_head(&j.ready_wait); diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 9220d7de10db..dea73bc1cb51 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" -#include "printbuf.h" #include "thread_with_file.h" #include <linux/anon_inodes.h> @@ -10,6 +9,7 @@ #include <linux/kthread.h> #include <linux/pagemap.h> #include <linux/poll.h> +#include <linux/sched/sysctl.h> void bch2_thread_with_file_exit(struct thread_with_file *thr) { @@ -65,68 +65,87 @@ err: return ret; } -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +/* stdio_redirect */ + +static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) { - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; + return stdio->input.buf.nr > seen || stdio->done; } -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) { - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - size_t copied = 0, b; - int ret = 0; + return stdio_redirect_has_more_input(stdio, 0); +} - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) - return -EAGAIN; +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr || stdio->done; +} - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; +#define STDIO_REDIRECT_BUFSIZE 4096 - if (thr->thr.done) - return 0; +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || 
stdio->done; +} - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); +/* thread_with_stdio */ - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} - b = min(len, thr->output2.nr); - if (!b) - break; +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct stdio_buf *buf = &thr->stdio.output; + size_t copied = 0, b; + int ret = 0; - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) + return -EAGAIN; + + while (len && buf->buf.nr) { + if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - copied += b; - buf += b; - len -= b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, buf->buf.nr); + + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { + ubuf += b; + len -= b; + copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + } + spin_unlock_irq(&buf->lock); } return copied ?: ret; @@ -137,27 +156,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + thread_with_stdio_done(thr); bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); - thr->exit(thr); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); + thr->ops->exit(thr); return 0; } -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; + struct stdio_buf *buf = &thr->stdio.input; size_t copied = 0; ssize_t ret = 0; @@ -173,29 +185,34 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); - - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; + spin_lock(&buf->lock); + size_t makeroom = b; + if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) + makeroom = 
min_t(ssize_t, makeroom, + max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, + 0)); + darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); + + b = min(len, darray_room(buf->buf)); + + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; } - spin_unlock(&thr->stdio.input_lock); + spin_unlock(&buf->lock); if (b) { - wake_up(&thr->stdio.input_wait); + wake_up(&buf->wait); } else { if ((file->f_flags & O_NONBLOCK)) { ret = -EAGAIN; break; } - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); if (ret) break; } @@ -209,90 +226,266 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); __poll_t mask = 0; - if (thread_with_stdio_has_output(thr)) + if (stdio_redirect_has_output(&thr->stdio)) mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) + if (stdio_redirect_has_input_space(&thr->stdio)) mask |= EPOLLOUT; if (thr->thr.done) mask |= EPOLLHUP|EPOLLERR; return mask; } +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, - .llseek = no_llseek, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, +}; + +static const struct file_operations thread_with_stdout_fops = { + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; +static int thread_with_stdio_fn(void *arg) +{ + struct thread_with_stdio *thr = arg; + + thr->thr.ret = thr->ops->fn(thr); + + thread_with_stdio_done(thr); + return 0; +} + +void bch2_thread_with_stdio_init(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; +} + +int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr) +{ + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); +} + int bch2_run_thread_with_stdio(struct thread_with_stdio 
*thr, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) + const struct thread_with_stdio_ops *ops) { - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); + bch2_thread_with_stdio_init(thr, ops); - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); + return __bch2_run_thread_with_stdio(thr); +} - darray_init(&thr->output2); - thr->exit = exit; +int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); } +EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); - wake_up(&stdio->input_wait); + wake_up(&buf->wait); return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, + darray_char *line, + unsigned long timeout) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + unsigned long until = jiffies + timeout, t; + struct stdio_buf *buf = &stdio->input; + size_t seen = 0; +again: + t = timeout != MAX_SCHEDULE_TIMEOUT + ? 
max_t(long, until - jiffies, 0) + : timeout; + + t = min(t, sysctl_hung_task_timeout_secs * HZ / 2); + + wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); - if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); + spin_lock(&buf->lock); + seen = buf->buf.nr; + char *n = memchr(buf->buf.data, '\n', seen); + + if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) { + spin_unlock(&buf->lock); + return -ETIME; + } + + if (!n) { + buf->waiting_for_line = true; + spin_unlock(&buf->lock); + goto again; + } + + size_t b = n + 1 - buf->buf.data; + if (b > line->size) { + spin_unlock(&buf->lock); + int ret = darray_resize(line, b); + if (ret) + return ret; + seen = 0; + goto again; + } + + buf->buf.nr -= b; + memcpy(line->data, buf->buf.data, b); + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + line->nr = b; + + buf->waiting_for_line = false; + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + return 0; +} + +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) +{ + return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT); +} + +__printf(3, 0) +static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + ssize_t ret; + + do { + va_list args2; + size_t len; + + va_copy(args2, args); + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } + + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; +} + +ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + ssize_t ret; + +again: + spin_lock_irqsave(&buf->lock, flags); + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + + wake_up(&buf->wait); + return ret; +} + +ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ + va_list args; + ssize_t ret; + + va_start(args, fmt); + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); + return ret; } diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 05879c5048c8..72497b921911 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -4,6 +4,38 @@ #include "thread_with_file_types.h" +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. 
+ * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. + * + * To write, use stdio_redirect_printf() + * To read, use stdio_redirect_read() or stdio_redirect_readline() + */ + struct task_struct; struct thread_with_file { @@ -17,25 +49,33 @@ int bch2_run_thread_with_file(struct thread_with_file *, const struct file_operations *, int (*fn)(void *)); +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); +}; + struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - DARRAY(char) output2; - void (*exit)(struct thread_with_stdio *); + const struct thread_with_stdio_ops *ops; }; -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); -} - +void bch2_thread_with_stdio_init(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); +int __bch2_run_thread_with_stdio(struct thread_with_stdio *); int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)); + const struct thread_with_stdio_ops *); +int bch2_run_thread_with_stdout(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); -int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long); +int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); + +__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 90b5e645e98c..f4d484d44f63 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,14 +2,18 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H -struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; + bool waiting_for_line; +}; - spinlock_t input_lock; - wait_queue_head_t input_wait; - struct printbuf input_buf; 
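As a minimal sketch of how the thread_with_stdio interface documented in thread_with_file.h above is meant to be consumed: the embedding struct echo_thread, the echo_*() names and run_echo_thread() are illustrative assumptions (as are the <linux/slab.h>/darray.h includes they'd need), not code from this patch; only struct thread_with_stdio_ops, bch2_run_thread_with_stdio(), bch2_stdio_redirect_readline() and bch2_stdio_redirect_printf() come from the interfaces added here.

	struct echo_thread {
		struct thread_with_stdio	thr;
	};

	static void echo_exit(struct thread_with_stdio *thr)
	{
		/* runs once the file descriptor has been released */
		kfree(container_of(thr, struct echo_thread, thr));
	}

	static int echo_fn(struct thread_with_stdio *thr)
	{
		darray_char line = {};

		/* readline blocks until a full line arrives or the fd is closed */
		while (!bch2_stdio_redirect_readline(&thr->stdio, &line))
			bch2_stdio_redirect_printf(&thr->stdio, false, "echo: %.*s",
						   (int) line.nr, line.data);

		darray_exit(&line);
		return 0;
	}

	static const struct thread_with_stdio_ops echo_ops = {
		.exit	= echo_exit,
		.fn	= echo_fn,
	};

	static int run_echo_thread(void)
	{
		struct echo_thread *t = kzalloc(sizeof(*t), GFP_KERNEL);

		if (!t)
			return -ENOMEM;

		/* on success this is the fd wired to the kthread (per bch2_run_thread_with_file()) */
		return bch2_run_thread_with_stdio(&t->thr, &echo_ops);
	}
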
+struct stdio_redirect { + struct stdio_buf input; + struct stdio_buf output; bool done; }; diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c new file mode 100644 index 000000000000..3fe82757f93a --- /dev/null +++ b/fs/bcachefs/time_stats.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/time.h> +#include <linux/spinlock.h> + +#include "eytzinger.h" +#include "time_stats.h" + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ + { "eon", U64_MAX }, +}; + +const struct time_unit *bch2_pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + bool initted = stats->last_event != 0; + + if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (quantiles) + quantiles_update(quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + +static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __bch2_time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, 
flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} + +void bch2_time_stats_reset(struct bch2_time_stats *stats) +{ + spin_lock_irq(&stats->lock); + unsigned offset = offsetof(struct bch2_time_stats, min_duration); + memset((void *) stats + offset, 0, sizeof(*stats) - offset); + + if (stats->buffer) { + int cpu; + for_each_possible_cpu(cpu) + per_cpu_ptr(stats->buffer, cpu)->nr = 0; + } + spin_unlock_irq(&stats->lock); +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h new file mode 100644 index 000000000000..dc6493f7bbab --- /dev/null +++ b/fs/bcachefs/time_stats.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * bch2_time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analagous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus. average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _BCACHEFS_TIME_STATS_H +#define _BCACHEFS_TIME_STATS_H + +#include <linux/sched/clock.h> +#include <linux/spinlock_types.h> +#include <linux/string.h> + +#include "mean_and_variance.h" + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *bch2_pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't + * use in new code. 
+ */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[31]; +}; + +struct bch2_time_stats { + spinlock_t lock; + bool have_quantiles; + struct time_stat_buffer __percpu *buffer; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + + struct mean_and_variance duration_stats; + struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance_weighted freq_stats_weighted; +}; + +struct bch2_time_stats_quantiles { + struct bch2_time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) +{ + return stats->have_quantiles + ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles + : NULL; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - bch2_time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end duration of the event will be the current time + */ +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - bch2_time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked. 
+ */ +static inline bool track_event_change(struct bch2_time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + bch2_time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +void bch2_time_stats_reset(struct bch2_time_stats *); +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_exit(&statq->stats); +} +static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_init(&statq->stats); + statq->stats.have_quantiles = true; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + +#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index dc48b52b01b4..dfad1d06633d 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,6 +4,7 @@ #include "buckets.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 293b90d704fb..c1b51009edf6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -3,7 +3,6 @@ #define TRACE_SYSTEM bcachefs #if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BCACHEFS_H #include <linux/tracepoint.h> @@ -43,7 +42,7 @@ DECLARE_EVENT_CLASS(fs_str, TP_fast_assign( __entry->dev = c->dev; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) @@ -64,7 +63,7 @@ DECLARE_EVENT_CLASS(trans_str, __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %pS %s", @@ -85,7 +84,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, TP_fast_assign( __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %s", @@ -200,6 +199,80 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + +/* fs.c: */ +TRACE_EVENT(bch2_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +/* fs-io.c: */ +TRACE_EVENT(bch2_fsync, + TP_PROTO(struct file *file, int datasync), + + TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + + 
__entry->dev = dentry->d_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; + __entry->datasync = datasync; + ), + + TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) +); + /* super-io.c: */ TRACE_EVENT(write_super, TP_PROTO(struct bch_fs *c, unsigned long ip), @@ -508,6 +581,7 @@ TRACE_EVENT(btree_path_relock_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u8, self_read_count ) @@ -525,7 +599,8 @@ TRACE_EVENT(btree_path_relock_fail, strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; - __entry->level = path->level; + __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); @@ -538,7 +613,7 @@ TRACE_EVENT(btree_path_relock_fail, c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_intent]; - scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) @@ -546,9 +621,10 @@ TRACE_EVENT(btree_path_relock_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -575,6 +651,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) @@ -592,6 +669,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); @@ -607,9 +685,10 @@ TRACE_EVENT(btree_path_upgrade_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -638,102 +717,17 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ -DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err), - - TP_STRUCT__entry( - __field(u8, dev ) - __array(char, reserve, 16 ) - __field(u64, bucket ) - __field(u64, free ) - __field(u64, avail ) - __field(u64, 
copygc_wait_amount ) - __field(s64, copygc_waiting_for ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, nouse ) - __field(bool, nonblocking ) - __field(u64, nocow ) - __array(char, err, 32 ) - ), - - TP_fast_assign( - __entry->dev = ca->dev_idx; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->bucket = bucket; - __entry->free = free; - __entry->avail = avail; - __entry->copygc_wait_amount = copygc_wait_amount; - __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = s->buckets_seen; - __entry->open = s->skipped_open; - __entry->need_journal_commit = s->skipped_need_journal_commit; - __entry->nouse = s->skipped_nouse; - __entry->nonblocking = nonblocking; - __entry->nocow = s->skipped_nocow; - strscpy(__entry->err, err, sizeof(__entry->err)); - ), +DEFINE_EVENT(fs_str, bucket_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - __entry->reserve, - __entry->dev, - __entry->bucket, - __entry->free, - __entry->avail, - __entry->copygc_wait_amount, - __entry->copygc_waiting_for, - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->nouse, - __entry->nocow, - __entry->nonblocking, - __entry->err) +DEFINE_EVENT(fs_str, bucket_alloc_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) -); - -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) -); - -TRACE_EVENT(discard_buckets, +DECLARE_EVENT_CLASS(discard_buckets_class, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), @@ -765,6 +759,18 @@ TRACE_EVENT(discard_buckets, __entry->err) ); +DEFINE_EVENT(discard_buckets_class, discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + +DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), @@ -878,8 +884,8 @@ TRACE_EVENT(move_data, TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, - u64 fragmentation, int ret), - TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + int ret), + TP_ARGS(c, bucket, sectors, bucket_size, ret), TP_STRUCT__entry( 
__field(dev_t, dev ) @@ -887,7 +893,6 @@ TRACE_EVENT(evacuate_bucket, __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) - __field(u64, fragmentation ) __field(int, ret ) ), @@ -897,45 +902,42 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; - __entry->fragmentation = fragmentation; __entry->ret = ret; ), - TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, - __entry->fragmentation, __entry->ret) + __entry->ret) ); TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, - u64 sectors_moved, u64 sectors_not_moved, - u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(c, - sectors_moved, sectors_not_moved, - buckets_moved, buckets_not_moved), + u64 buckets, + u64 sectors_seen, + u64 sectors_moved), + TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) + __field(u64, buckets ) + __field(u64, sectors_seen ) __field(u64, sectors_moved ) - __field(u64, sectors_not_moved ) - __field(u64, buckets_moved ) - __field(u64, buckets_not_moved ) ), TP_fast_assign( __entry->dev = c->dev; + __entry->buckets = buckets; + __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; - __entry->sectors_not_moved = sectors_not_moved; - __entry->buckets_moved = buckets_moved; - __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + __entry->buckets, + __entry->sectors_seen, + __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, @@ -1023,10 +1025,33 @@ TRACE_EVENT(trans_restart_split_race, __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + + __field(unsigned long, key_cache_nr_keys ) + __field(unsigned long, key_cache_nr_dirty ) + __field(long, must_wait ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); + __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); + __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); + ), + + TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->key_cache_nr_keys, + __entry->key_cache_nr_dirty, + __entry->must_wait) ); TRACE_EVENT(trans_restart_journal_preres_get, @@ -1323,6 +1348,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1359,10 +1390,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); 
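The bucket_alloc and bucket_alloc_fail events above are moved from a wide, many-field event class onto the string-based fs_str class, so the formatting now happens in the caller. A sketch of what that caller side might look like, under the assumption that callers build the string with a printbuf only when the tracepoint is live; the helper name and message contents are illustrative, not taken from this patch:

	static void bucket_alloc_fail_trace(struct bch_fs *c,
					    const char *reserve, const char *err)
	{
		/* trace_<event>_enabled() is the standard tracepoint guard */
		if (!trace_bucket_alloc_fail_enabled())
			return;

		struct printbuf buf = PRINTBUF;

		prt_printf(&buf, "reserve %s err %s", reserve, err);
		trace_bucket_alloc_fail(c, buf.buf);
		printbuf_exit(&buf);
	}

This keeps the tracepoint definition trivial while letting the allocator report as much or as little detail as it wants.
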
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) +TRACE_EVENT(key_cache_fill, + TP_PROTO(struct btree_trans *trans, const char *key), + TP_ARGS(trans, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %s", __entry->trans_fn, __get_str(key)) ); TRACE_EVENT(write_buffer_flush, @@ -1421,6 +1463,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); +TRACE_EVENT(write_buffer_maybe_flush, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), + TP_ARGS(trans, caller_ip, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) +); + DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) @@ -1431,6 +1491,475 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +); + +#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS + +TRACE_EVENT(update_by_path, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite), + TP_ARGS(trans, path, i, overwrite), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, path_idx ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + __field(u8, overwrite ) + __field(btree_path_idx_t, update_idx ) + __field(btree_path_idx_t, nr_updates ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->path_idx = path - trans->paths; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->overwrite = overwrite; + __entry->update_idx = i - trans->updates; + __entry->nr_updates = trans->nr_updates; + ), + + TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u", + __entry->trans_fn, + __entry->path_idx, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->overwrite, + __entry->update_idx, + __entry->nr_updates) +); + +TRACE_EVENT(btree_path_lock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_bkey_cached_common *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + __array(char, node, 24 ) + __field(u32, lock_seq ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = b->btree_id; + __entry->level = 
b->level; + + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + __entry->lock_seq = six_lock_seq(&b->lock); + ), + + TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->level, + __entry->node, + __entry->lock_seq) +); + +DECLARE_EVENT_CLASS(btree_path_ev, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(u16, idx ) + __field(u8, ref ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u", + __entry->idx, __entry->ref, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(btree_path_ev, btree_path_get_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_put_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_alloc, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u", + __entry->idx, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +TRACE_EVENT(btree_path_get, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot) +); + +DECLARE_EVENT_CLASS(btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, new_idx ) + __field(u8, btree_id ) + __field(u8, ref ) + __field(u8, preserve ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->new_idx = new - trans->paths; + __entry->btree_id = 
path->btree_id; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->new_idx) +); + +DEFINE_EVENT(btree_path_clone, btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DEFINE_EVENT(btree_path_clone, btree_path_save_pos, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DECLARE_EVENT_CLASS(btree_path_traverse, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, should_be_locked ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + + __entry->locks_want = path->locks_want; + __entry->nodes_locked = path->nodes_locked; + struct btree *b = path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->trans_fn, + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->locks_want, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_set_pos, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path, + struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + 
__field(u8, ref ) + __field(u8, preserve ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + + __entry->nodes_locked = path->nodes_locked; + struct btree *b = path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +TRACE_EVENT(btree_path_free, + TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), + TP_ARGS(trans, path, dup), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, preserve ) + __field(u8, should_be_locked) + __field(s8, dup ) + __field(u8, dup_locked ) + ), + + TP_fast_assign( + __entry->idx = path; + __entry->preserve = trans->paths[path].preserve; + __entry->should_be_locked = trans->paths[path].should_be_locked; + __entry->dup = dup ? dup - trans->paths : -1; + __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; + ), + + TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, + __entry->preserve ? 'P' : ' ', + __entry->should_be_locked ? 
'S' : ' ', + __entry->dup, + __entry->dup_locked) +); + +TRACE_EVENT(btree_path_free_trans_begin, + TP_PROTO(btree_path_idx_t path), + TP_ARGS(path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + ), + + TP_fast_assign( + __entry->idx = path; + ), + + TP_printk(" path %3u", __entry->idx) +); + +#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ +#ifndef _TRACE_BCACHEFS_H + +static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite) {} +static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} +static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} +static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} + +#endif +#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ + +#define _TRACE_BCACHEFS_H #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h index 905801772002..7f647846b511 100644 --- a/fs/bcachefs/two_state_shared_lock.h +++ b/fs/bcachefs/two_state_shared_lock.h @@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) { long i = s ? 1 : -1; - long v = atomic_long_read(&lock->v), old; + long old; + old = atomic_long_read(&lock->v); do { - old = v; - - if (i > 0 ? v < 0 : v > 0) + if (i > 0 ? old < 0 : old > 0) return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); + } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); + return true; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3a32faa86b5c..da2cd11b3025 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * random utiility code, for bcache but in theory not specific to bcache + * random utility code, for bcache but in theory not specific to bcache * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. 
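The bch2_two_state_trylock() change just above replaces an open-coded atomic_long_cmpxchg_acquire() retry loop with atomic_long_try_cmpxchg_acquire(), which writes the observed value back into 'old' on failure so the loop no longer re-reads it by hand. The same idiom applied to a hypothetical helper, purely as an illustration of the pattern:

	static inline bool inc_unless_negative(atomic_long_t *v)
	{
		long old = atomic_long_read(v);

		do {
			if (old < 0)
				return false;
			/* on failure, try_cmpxchg refreshes 'old' with the current value */
		} while (!atomic_long_try_cmpxchg_acquire(v, &old, old + 1));

		return true;
	}
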
@@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res) *res = 1; while (p--) { - if (*res > div_u64(U64_MAX, n)) + if (*res > div64_u64(U64_MAX, n)) return -ERANGE; *res *= n; } @@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res) parse_or_ret(cp, parse_unit_suffix(cp, &b)); - if (v > div_u64(U64_MAX, b)) + if (v > div64_u64(U64_MAX, b)) return -ERANGE; v *= b; - if (f_n > div_u64(U64_MAX, b)) + if (f_n > div64_u64(U64_MAX, b)) return -ERANGE; - f_n = div_u64(f_n * b, f_d); + f_n = div64_u64(f_n * b, f_d); if (v + f_n < v) return -ERANGE; v += f_n; @@ -204,7 +204,7 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) -u64 bch2_read_flag_list(char *opt, const char * const list[]) +u64 bch2_read_flag_list(const char *opt, const char * const list[]) { u64 ret = 0; char *p, *s, *d = kstrdup(opt, GFP_KERNEL); @@ -214,7 +214,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) s = strim(d); - while ((p = strsep(&s, ","))) { + while ((p = strsep(&s, ",;"))) { int flag = match_string(list, -1, p); if (flag < 0) { @@ -222,7 +222,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) break; } - ret |= 1 << flag; + ret |= BIT_ULL(flag); } kfree(d); @@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) +static void __bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { + bool locked = false; const char *p; if (!lines) { @@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) return; } - console_lock(); + if (!nonblocking) { + console_lock(); + locked = true; + } else { + locked = console_trylock(); + } + while (1) { p = strchrnul(lines, '\n'); printk("%s%.*s\n", prefix, (int) (p - lines), lines); @@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) break; lines = p + 1; } - console_unlock(); + if (locked) + console_unlock(); +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, false); +} + +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, true); } int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, @@ -337,167 +356,23 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -} - -/* time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if 
(unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } + prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); } static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); - prt_tab_rjust(out); - prt_printf(out, "%s", u->name); + prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); } static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { - prt_str(out, name); - prt_tab(out); + prt_printf(out, "%s\t", name); bch2_pr_time_units_aligned(out, ns); prt_newline(out); } @@ -506,10 +381,9 @@ static inline void 
pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { - const struct time_unit *u; + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; @@ -531,12 +405,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats } printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:"); - prt_tab(out); - prt_printf(out, "%llu ", - stats->duration_stats.n); + prt_printf(out, "count:\t%llu\n", stats->duration_stats.n); printbuf_tabstop_pop(out); - prt_newline(out); printbuf_tabstops_reset(out); @@ -545,13 +415,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 0); printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - prt_tab(out); - prt_printf(out, "since mount"); - prt_tab_rjust(out); - prt_tab(out); - prt_printf(out, "recent"); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\tsince mount\r\trecent\r\n"); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); @@ -559,88 +423,67 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 2); printbuf_tabstop_push(out, TABSTOP_SIZE); - prt_printf(out, "duration of events"); - prt_newline(out); + prt_printf(out, "duration of events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); pr_name_and_units(out, "total:", stats->total_duration); - prt_printf(out, "mean:"); - prt_tab(out); + prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); - prt_printf(out, "time between events"); - prt_newline(out); + prt_printf(out, "time between events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_freq); pr_name_and_units(out, "max:", stats->max_freq); - prt_printf(out, "mean:"); - prt_tab(out); + prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); 
printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + if (quantiles) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + bch2_pick_time_units(quantiles->entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } } } -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} /* ratelimit: */ @@ -746,40 +589,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro if (!out->nr_tabstops) printbuf_tabstop_push(out, 20); - prt_printf(out, "rate:"); - prt_tab(out); + prt_printf(out, "rate:\t"); prt_human_readable_s64(out, pd->rate.rate); prt_newline(out); - prt_printf(out, "target:"); - prt_tab(out); + prt_printf(out, "target:\t"); prt_human_readable_u64(out, pd->last_target); prt_newline(out); - prt_printf(out, "actual:"); - prt_tab(out); + prt_printf(out, "actual:\t"); prt_human_readable_u64(out, pd->last_actual); prt_newline(out); - prt_printf(out, "proportional:"); - prt_tab(out); + prt_printf(out, "proportional:\t"); prt_human_readable_s64(out, pd->last_proportional); prt_newline(out); - prt_printf(out, "derivative:"); - prt_tab(out); + prt_printf(out, "derivative:\t"); prt_human_readable_s64(out, pd->last_derivative); prt_newline(out); - prt_printf(out, "change:"); - prt_tab(out); + prt_printf(out, "change:\t"); prt_human_readable_s64(out, pd->last_change); prt_newline(out); - prt_printf(out, "next io:"); - prt_tab(out); - prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); - prt_newline(out); + prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); } /* misc: */ @@ -819,19 +653,25 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) return 0; } -size_t bch2_rand_range(size_t max) +u64 bch2_get_random_u64_below(u64 ceil) { - size_t rand; + if (ceil <= U32_MAX) + return __get_random_u32_below(ceil); - if (!max) - return 0; + /* this is the same (clever) algorithm as in __get_random_u32_below() */ + u64 rand = get_random_u64(); + u64 mult = ceil * rand; - do { - rand = get_random_long(); - rand &= roundup_pow_of_two(max) - 1; - } while (rand >= max); + if (unlikely(mult < ceil)) { + u64 bound; + div64_u64_rem(-ceil, ceil, &bound); + while (unlikely(mult < bound)) { + rand = get_random_u64(); + mult = ceil * rand; + } + } - return rand; + return mul_u64_u64_shr(ceil, rand, 64); } void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) @@ -864,171 +704,6 @@ void 
memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, 
gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b414736d59a5..f4a4783219d9 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -8,6 +8,7 @@ #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> +#include <linux/min_heap.h> #include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/log2.h> @@ -21,6 +22,7 @@ #include "mean_and_variance.h" #include "darray.h" +#include "time_stats.h" struct closure; @@ -53,167 +55,30 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) +static inline void *bch2_kvmalloc(size_t n, gfp_t flags) { - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); + void *p = unlikely(n >= INT_MAX) + ? vmalloc(n) + : kvmalloc(n, flags & ~__GFP_ZERO); + if (p && (flags & __GFP_ZERO)) + memset(p, 0, n); + return p; } -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - -#define HEAP(type) \ -struct { \ - size_t size, used; \ - type *data; \ -} - -#define DECLARE_HEAP(type, name) HEAP(type) name - #define init_heap(heap, _size, gfp) \ ({ \ - (heap)->used = 0; \ + (heap)->nr = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) -#define heap_set_backpointer(h, i, _fn) \ -do { \ - void (*fn)(typeof(h), size_t) = _fn; \ - if (fn) \ - fn(h, i); \ -} while (0) - -#define heap_swap(h, i, j, set_backpointer) \ -do { \ - swap((h)->data[i], (h)->data[j]); \ - heap_set_backpointer(h, i, set_backpointer); \ - heap_set_backpointer(h, j, set_backpointer); \ -} while (0) - -#define heap_peek(h) \ -({ \ - EBUG_ON(!(h)->used); \ - (h)->data[0]; \ -}) - -#define heap_full(h) ((h)->used == (h)->size) - -#define heap_sift_down(h, i, cmp, set_backpointer) \ -do { \ - size_t _c, _j = i; \ - \ - for (; _j * 2 + 1 < (h)->used; _j = _c) { \ - _c = _j * 2 + 1; \ - if (_c + 1 < (h)->used && \ - cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ - _c++; \ - \ - if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ - break; \ - heap_swap(h, _c, _j, set_backpointer); \ - } \ -} while (0) - -#define heap_sift_up(h, i, cmp, set_backpointer) \ -do { \ - while (i) { \ - size_t p = (i - 1) / 2; \ - if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ - break; \ - heap_swap(h, i, p, set_backpointer); \ - i = p; \ - } \ -} while (0) - -#define __heap_add(h, d, cmp, set_backpointer) \ -({ \ - size_t _i = (h)->used++; \ - (h)->data[_i] = d; \ - heap_set_backpointer(h, _i, set_backpointer); \ - \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - _i; \ 
-}) - -#define heap_add(h, d, cmp, set_backpointer) \ -({ \ - bool _r = !heap_full(h); \ - if (_r) \ - __heap_add(h, d, cmp, set_backpointer); \ - _r; \ -}) - -#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -do { \ - if (!heap_add(h, new, cmp, set_backpointer) && \ - cmp(h, new, heap_peek(h)) >= 0) { \ - (h)->data[0] = new; \ - heap_set_backpointer(h, 0, set_backpointer); \ - heap_sift_down(h, 0, cmp, set_backpointer); \ - } \ -} while (0) - -#define heap_del(h, i, cmp, set_backpointer) \ -do { \ - size_t _i = (i); \ - \ - BUG_ON(_i >= (h)->used); \ - (h)->used--; \ - if ((_i) < (h)->used) { \ - heap_swap(h, _i, (h)->used, set_backpointer); \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - heap_sift_down(h, _i, cmp, set_backpointer); \ - } \ -} while (0) - -#define heap_pop(h, d, cmp, set_backpointer) \ -({ \ - bool _r = (h)->used; \ - if (_r) { \ - (d) = (h)->data[0]; \ - heap_del(h, 0, cmp, set_backpointer); \ - } \ - _r; \ -}) - -#define heap_resort(heap, cmp, set_backpointer) \ -do { \ - ssize_t _i; \ - for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift_down(heap, _i, cmp, set_backpointer); \ -} while (0) - #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) @@ -340,12 +205,13 @@ static inline int bch2_strtoul_h(const char *cp, long *res) bool bch2_is_zero(const void *, size_t); -u64 bch2_read_flag_list(char *, const char * const[]); +u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); @@ -361,84 +227,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, 
u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); - #define ewma_add(ewma, val, weight) \ ({ \ typeof(ewma) _ewma = (ewma); \ @@ -537,6 +327,19 @@ do { \ _ptr ? container_of(_ptr, type, member) : NULL; \ }) +static inline struct list_head *list_pop(struct list_head *head) +{ + if (list_empty(head)) + return NULL; + + struct list_head *ret = head->next; + list_del_init(ret); + return ret; +} + +#define list_pop_entry(head, type, member) \ + container_of_or_null(list_pop(head), type, member) + /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { @@ -552,11 +355,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) void bch2_bio_map(struct bio *bio, void *base, size_t); int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} - #define closure_bio_submit(bio, cl) \ do { \ closure_get(cl); \ @@ -603,7 +401,7 @@ do { \ _ret; \ }) -size_t bch2_rand_range(size_t); +u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); @@ -738,10 +536,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ @@ -788,8 +582,15 @@ static inline void __move_gap(void *array, size_t element_size, } /* Move the gap in a gap buffer: */ -#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ - __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) +#define move_gap(_d, _new_gap) \ +do { \ + BUG_ON(_new_gap > (_d)->nr); \ + BUG_ON((_d)->gap > (_d)->nr); \ + \ + __move_gap((_d)->data, sizeof((_d)->data[0]), \ + (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ + (_d)->gap = _new_gap; \ +} while (0) #define bubble_sort(_base, _nr, _cmp) \ do { \ @@ -806,14 +607,19 @@ do { \ } \ } while (0) +#define per_cpu_sum(_p) \ +({ \ + typeof(*_p) _ret = 0; \ + \ + int cpu; \ + for_each_possible_cpu(cpu) \ + _ret += *per_cpu_ptr(_p, cpu); \ + _ret; \ +}) + static inline u64 percpu_u64_get(u64 __percpu *src) { - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(src, cpu); - return ret; + return per_cpu_sum(src); } static inline void percpu_u64_set(u64 __percpu *dst, u64 src) @@ -827,9 +633,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) { - unsigned i; - - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) acc[i] += src[i]; } @@ -866,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - static inline bool qstr_eq(const struct qstr l, const struct qstr r) { return l.len == r.len && 
!memcmp(l.name, r.name, l.len); @@ -876,4 +678,52 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) void bch2_darray_str_exit(darray_str *); int bch2_split_devs(const char *, darray_str *); +#ifdef __KERNEL__ + +__must_check +static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + +__must_check +static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n) ? -EFAULT : 0; +} + +#endif + +static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) +{ + if (v) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + +static inline void __set_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline void __clear_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline bool test_bit_le64(size_t bit, __le64 *addr) +{ + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; +} + +static inline void memcpy_swab(void *_dst, void *_src, size_t len) +{ + u8 *dst = _dst + len; + u8 *src = _src; + + while (len--) + *--dst = *src++; +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index cb4f33ed9ab3..6620ecae26af 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -3,12 +3,13 @@ #include <linux/bitops.h> #include <linux/math.h> #include <linux/string.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #ifdef CONFIG_VALGRIND #include <valgrind/memcheck.h> #endif +#include "errcode.h" #include "varint.h" /** @@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) u64 v; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { __le64 v_le = 0; @@ -85,7 +86,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v) if (likely(bytes < 9)) { v <<= bytes; - v |= ~(~0 << (bytes - 1)); + v |= ~(~0U << (bytes - 1)); } else { *out++ = 255; bytes = 9; @@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { v >>= bytes; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9c0d2316031b..aed7c6984173 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) +int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len)); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, - xattr_val_size_too_small, + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, + c, xattr_val_size_too_small, "value too small (%zu < %u)", bkey_val_u64s(k.k), val_u64s); @@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4); - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, - xattr_val_size_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, + c, xattr_val_size_too_big, "value too big (%zu > %u)", 
bkey_val_u64s(k.k), val_u64s); - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, - xattr_invalid_type, + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), + c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, - xattr_name_invalid_chars, + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: return ret; @@ -118,11 +117,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, else prt_printf(out, "(unknown type %u)", xattr.v->x_type); + unsigned name_len = xattr.v->x_name_len; + unsigned val_len = le16_to_cpu(xattr.v->x_val_len); + unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - + offsetof(struct bch_xattr, x_name); + + val_len = min_t(int, val_len, max_name_val_bytes - name_len); + name_len = min(name_len, max_name_val_bytes); + prt_printf(out, "%.*s:%.*s", - xattr.v->x_name_len, - xattr.v->x_name, - le16_to_cpu(xattr.v->x_val_len), - (char *) xattr_val(xattr.v)); + name_len, xattr.v->x_name, + val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { @@ -138,21 +143,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_s_c k; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); - if (ret) - goto err1; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) - goto err2; + return ret; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -160,10 +157,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } -err2: bch2_trans_iter_exit(trans, &iter); -err1: - return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; + return ret; } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, @@ -177,7 +172,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -212,8 +207,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, inum, &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| + (flags & XATTR_REPLACE ? 
STR_HASH_must_replace : 0)); } else { struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); @@ -255,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix, return 0; } +static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) +{ + const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); +} + static int bch2_xattr_emit(struct dentry *dentry, const struct bch_xattr *xattr, struct xattr_buf *buf) { - const struct xattr_handler *handler = - bch2_xattr_type_to_handler(xattr->x_type); + const char *prefix; + + prefix = bch2_xattr_prefix(xattr->x_type, dentry); + if (!prefix) + return 0; - return handler && (!handler->list || handler->list(dentry)) - ? __bch2_xattr_emit(handler->prefix ?: handler->name, - xattr->x_name, xattr->x_name_len, buf) - : 0; + return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, @@ -300,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; - u32 snapshot; - int ret; -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), - POS(inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_xattr) - continue; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, + POS(inum, offset), + POS(inum, U64_MAX), + inode->ei_inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_xattr) + continue; - ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - if (ret) - break; - } + bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + }))) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - if (ret) - goto out; - - return buf.used; -out: - return bch2_err_class(ret); + return ret ? 
bch2_err_class(ret) : buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler, @@ -356,9 +330,12 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, NULL, NULL, 0, + int ret = bch2_trans_do(c, bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); + if (ret < 0 && bch2_err_matches(ret, ENOENT)) + ret = -ENODATA; + return bch2_err_class(ret); } @@ -544,11 +521,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - return ret; + goto err_class_exit; ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) - return ret; + goto err_class_exit; s.v = v + 1; s.defined = true; @@ -588,13 +565,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); err: mutex_unlock(&inode->ei_update_lock); - - if (value && - (opt_id == Opt_background_target || - opt_id == Opt_background_compression || - (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) - bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); - +err_class_exit: return bch2_err_class(ret); } @@ -613,20 +584,26 @@ static int bch2_xattr_bcachefs_get_effective( name, buffer, size, true); } +/* Noop - xattrs in the bcachefs_effective namespace are inherited */ +static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + return 0; +} + static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { .prefix = "bcachefs_effective.", .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set, + .set = bch2_xattr_bcachefs_set_effective, }; #endif /* NO_BCACHEFS_FS */ -const struct xattr_handler *bch2_xattr_handlers[] = { +const struct xattr_handler * const bch2_xattr_handlers[] = { &bch_xattr_user_handler, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - &nop_posix_acl_access, - &nop_posix_acl_default, -#endif &bch_xattr_trusted_handler, &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1337f31a5c49..132fbbd15a66 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,12 +6,12 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_invalid = bch2_xattr_invalid, \ + .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) @@ -45,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -extern const struct xattr_handler *bch2_xattr_handlers[]; +extern const struct xattr_handler * const bch2_xattr_handlers[]; #endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index e9f810539552..c7916011ef34 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,7 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[]; + 
__u8 x_name[] __counted_by(x_name_len); } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */
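The switch from bch2_rand_range() to bch2_get_random_u64_below() in util.c replaces masked rejection sampling with the widening-multiply technique that the in-tree comment says mirrors __get_random_u32_below(). Below is a minimal userspace sketch of that technique, not the kernel code itself: rand64() and random_below() are illustrative stand-ins (rand64() for get_random_u64()), and it uses a GCC/Clang unsigned __int128 for clarity where the kernel keeps only the low 64 bits of the product and obtains the high half from mul_u64_u64_shr().

/* Sketch only: widening-multiply bounded random, userspace restatement. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t rand64(void)
{
	/* weak placeholder for get_random_u64(); illustration only */
	return ((uint64_t)rand() << 33) ^ ((uint64_t)rand() << 11) ^ (uint64_t)rand();
}

/* uniform value in [0, ceil), one 64x64->128 bit multiply per draw */
static uint64_t random_below(uint64_t ceil)
{
	uint64_t r = rand64();
	unsigned __int128 mult = (unsigned __int128)ceil * r;

	/*
	 * Rejecting draws whose low 64 bits fall below 2^64 % ceil makes every
	 * value of the high 64 bits equally likely; the outer test is just a
	 * fast path that skips the modulo in the common case.  The kernel
	 * computes the same bound via div64_u64_rem(-ceil, ceil, &bound).
	 */
	if ((uint64_t)mult < ceil) {
		uint64_t bound = (0 - ceil) % ceil;	/* == 2^64 mod ceil */
		while ((uint64_t)mult < bound) {
			r = rand64();
			mult = (unsigned __int128)ceil * r;
		}
	}
	return (uint64_t)(mult >> 64);	/* what mul_u64_u64_shr(ceil, rand, 64) returns */
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("%llu\n", (unsigned long long)random_below(1000));
	return 0;
}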
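util.h also gains memcpy_swab(), which copies a buffer while reversing byte order. A small self-contained restatement with a usage example follows; the main() and the const qualifier on the source are additions for illustration, while the copy loop itself matches the helper added by the patch.

#include <stdint.h>
#include <stdio.h>

static void memcpy_swab(void *_dst, const void *_src, size_t len)
{
	uint8_t *dst = (uint8_t *)_dst + len;	/* write backwards from the end */
	const uint8_t *src = _src;

	while (len--)
		*--dst = *src++;
}

int main(void)
{
	uint8_t be[4] = { 0x12, 0x34, 0x56, 0x78 };	/* 0x12345678, big-endian */
	uint32_t le;

	memcpy_swab(&le, be, sizeof(be));
	printf("0x%08x\n", le);		/* prints 0x12345678 on a little-endian host */
	return 0;
}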
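The bch2_xattr_to_text() hunk clamps x_name_len and x_val_len to the bytes actually present in the bkey value before handing them to %.*s, presumably so a corrupt key cannot make the formatter read past the value. The sketch below restates that clamping pattern in userspace with hypothetical names (struct xattr_rec, xattr_print); note the kernel clamps val_len first via min_t(int, ...) and then name_len, whereas this sketch clamps name_len first.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct xattr_rec {			/* stand-in for struct bch_xattr */
	unsigned char  name_len;
	unsigned short val_len;
	char           data[32];	/* name bytes followed by value bytes */
};

static void xattr_print(const struct xattr_rec *x, size_t rec_bytes)
{
	unsigned name_len = x->name_len;
	unsigned val_len  = x->val_len;
	unsigned max_name_val_bytes = rec_bytes - offsetof(struct xattr_rec, data);

	/* clamp possibly-corrupt on-disk lengths before printing with %.*s */
	if (name_len > max_name_val_bytes)
		name_len = max_name_val_bytes;
	if (val_len > max_name_val_bytes - name_len)
		val_len = max_name_val_bytes - name_len;

	printf("%.*s:%.*s\n",
	       (int)name_len, x->data,
	       (int)val_len,  x->data + name_len);
}

int main(void)
{
	struct xattr_rec x = { .name_len = 200 /* corrupt */, .val_len = 3 };

	memcpy(x.data, "user.foobar", 11);
	xattr_print(&x, sizeof(x));	/* reads at most 32 bytes, not 200 */
	return 0;
}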