diff options
Diffstat (limited to 'fs/bcachefs/sb-members.c')
-rw-r--r-- | fs/bcachefs/sb-members.c | 370 |
1 files changed, 274 insertions, 96 deletions
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c..6245e342a8a8 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,12 +1,47 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" +int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); + bch2_bkey_val_to_text(&buf, c, k); + + bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + +void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) +{ + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); +} + +void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) +{ + bch2_fs_inconsistent(ca->fs, + "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", + bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); +} + #define x(t, n, ...) [n] = #t, static const char * const bch2_iops_measurements[] = { BCH_IOPS_MEASUREMENTS() @@ -66,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) - return -BCH_ERR_ENOSPC_sb_members_v2; + return bch_err_throw(c, ENOSPC_sb_members_v2); for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); @@ -104,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); @@ -123,9 +163,9 @@ static int validate_member(struct printbuf *err, struct bch_sb *sb, int i) { - if (le64_to_cpu(m.nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m.nbuckets), LONG_MAX); + if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); return -BCH_ERR_invalid_sb_members; } @@ -150,6 +190,17 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + + if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && + sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { + prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -163,128 +214,92 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) return; - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); + prt_printf(out, "Device:\t%u\n", i); printbuf_indent_add(out, 2); - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { + prt_printf(out, "Label:\t"); + if (BCH_MEMBER_GROUP(&m)) + bch2_disk_path_to_text_sb(out, sb, + BCH_MEMBER_GROUP(&m) - 1); + else prt_printf(out, "(none)"); - } prt_newline(out); - prt_printf(out, "UUID:"); - prt_tab(out); + prt_printf(out, "UUID:\t"); pr_uuid(out, m.uuid.b); prt_newline(out); - prt_printf(out, "Size:"); - prt_tab(out); + prt_printf(out, "Size:\t"); prt_units_u64(out, device_size << 9); prt_newline(out); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s errors:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - for (unsigned i = 0; i < BCH_IOPS_NR; i++) { - prt_printf(out, "%s iops:", bch2_iops_measurements[i]); - prt_tab(out); - prt_printf(out, "%u", le32_to_cpu(m.iops[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - prt_printf(out, "Bucket size:"); - prt_tab(out); + prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); - prt_newline(out); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - prt_printf(out, "Last mount:"); - prt_tab(out); + prt_printf(out, "Last mount:\t"); if (m.last_mount) bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:"); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.seq)); - prt_newline(out); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", + prt_printf(out, "State:\t%s\n", BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR ? bch2_member_states[BCH_MEMBER_STATE(&m)] : "unknown"); - prt_newline(out); - prt_printf(out, "Data allowed:"); - prt_tab(out); + prt_printf(out, "Data allowed:\t"); if (BCH_MEMBER_DATA_ALLOWED(&m)) prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); - prt_printf(out, "Has data:"); - prt_tab(out); + prt_printf(out, "Has data:\t"); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); - prt_str(out, "Durability:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Btree allocated bitmap blocksize:\t"); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Btree allocated bitmap:\t"); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_newline(out); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); printbuf_indent_sub(out, 2); } -static int bch2_sb_members_v1_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; @@ -310,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v1_get(mi, i), gi, sb, i); } @@ -326,15 +349,32 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + if (!le16_to_cpu(mi->member_bytes)) { + prt_printf(out, "member_bytes 0"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + /* + * We call to_text() on superblock sections that haven't passed + * validate, so we can't trust sb->nr_devices. + */ + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v2_get(mi, i), gi, sb, i); } -static int bch2_sb_members_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - @@ -364,14 +404,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } - rcu_read_unlock(); } void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) @@ -389,12 +428,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); printbuf_indent_sub(out, 2); prt_str(out, "IO errors since "); @@ -403,12 +438,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); printbuf_indent_sub(out, 2); } @@ -426,3 +458,149 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + guard(rcu)(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (ca && + !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + return false; + } + return true; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); + BUG_ON(end > 64ULL << m->btree_bitmap_shift); + + for (unsigned bit = start >> m->btree_bitmap_shift; + (u64) bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + continue; + + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + } +} + +unsigned bch2_sb_nr_devices(const struct bch_sb *sb) +{ + unsigned nr = 0; + + for (unsigned i = 0; i < sb->nr_devices; i++) + nr += bch2_member_exists((struct bch_sb *) sb, i); + return nr; +} + +int bch2_sb_member_alloc(struct bch_fs *c) +{ + unsigned dev_idx = c->sb.nr_devices; + struct bch_sb_field_members_v2 *mi; + unsigned nr_devices; + unsigned u64s; + int best = -1; + u64 best_last_mount = 0; + unsigned nr_deleted = 0; + + if (dev_idx < BCH_SB_MEMBERS_MAX) + goto have_slot; + + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + /* eventually BCH_SB_MEMBERS_MAX will be raised */ + if (dev_idx == BCH_SB_MEMBER_INVALID) + continue; + + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } + + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + + return -BCH_ERR_ENOSPC_sb_members; +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members; + + c->disk_sb.sb->nr_devices = nr_devices; + return dev_idx; +} + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} |