Diffstat (limited to 'fs/bcachefs/sb-members.c')
-rw-r--r--  fs/bcachefs/sb-members.c  370
1 file changed, 274 insertions, 96 deletions
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..6245e342a8a8 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,12 +1,47 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
+#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
+{
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
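+/*
+ * Report a reference to a nonexistent device when all we have is the index;
+ * BCH_SB_MEMBER_INVALID is an expected sentinel and isn't reported.
+ */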
+void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
+{
+ if (dev != BCH_SB_MEMBER_INVALID)
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+}
+
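+/*
+ * Report a pointer to a bucket outside the device's valid range
+ * (first_bucket..nbuckets).
+ */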
+void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
+{
+ bch2_fs_inconsistent(ca->fs,
+ "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
+ bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
+}
+
#define x(t, n, ...) [n] = #t,
static const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
@@ -66,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
- return -BCH_ERR_ENOSPC_sb_members_v2;
+ return bch_err_throw(c, ENOSPC_sb_members_v2);
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
@@ -104,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
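+ /*
+ * Superblocks past the extent_flags incompat version drop the legacy
+ * members_v1 section instead of keeping it in sync.
+ */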
+ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
+ bch2_sb_field_resize(disk_sb, members_v1, 0);
+ return 0;
+ }
+
mi1 = bch2_sb_field_resize(disk_sb, members_v1,
DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
disk_sb->sb->nr_devices, sizeof(u64)));
@@ -123,9 +163,9 @@ static int validate_member(struct printbuf *err,
struct bch_sb *sb,
int i)
{
- if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
return -BCH_ERR_invalid_sb_members;
}
@@ -150,6 +190,17 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
+ prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
+ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -163,128 +214,92 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
return;
- prt_printf(out, "Device:");
- prt_tab(out);
- prt_printf(out, "%u", i);
- prt_newline(out);
+ prt_printf(out, "Device:\t%u\n", i);
printbuf_indent_add(out, 2);
- prt_printf(out, "Label:");
- prt_tab(out);
- if (BCH_MEMBER_GROUP(&m)) {
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
+ prt_printf(out, "Label:\t");
+ if (BCH_MEMBER_GROUP(&m))
+ bch2_disk_path_to_text_sb(out, sb,
+ BCH_MEMBER_GROUP(&m) - 1);
+ else
prt_printf(out, "(none)");
- }
prt_newline(out);
- prt_printf(out, "UUID:");
- prt_tab(out);
+ prt_printf(out, "UUID:\t");
pr_uuid(out, m.uuid.b);
prt_newline(out);
- prt_printf(out, "Size:");
- prt_tab(out);
+ prt_printf(out, "Size:\t");
prt_units_u64(out, device_size << 9);
prt_newline(out);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
- for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
- prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
- prt_tab(out);
- prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++)
+ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
- prt_printf(out, "Bucket size:");
- prt_tab(out);
+ prt_printf(out, "Bucket size:\t");
prt_units_u64(out, bucket_size << 9);
prt_newline(out);
- prt_printf(out, "First bucket:");
- prt_tab(out);
- prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
- prt_newline(out);
-
- prt_printf(out, "Buckets:");
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
- prt_newline(out);
+ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
+ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
- prt_printf(out, "Last mount:");
- prt_tab(out);
+ prt_printf(out, "Last mount:\t");
if (m.last_mount)
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
- prt_printf(out, "Last superblock write:");
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.seq));
- prt_newline(out);
+ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
- prt_printf(out, "State:");
- prt_tab(out);
- prt_printf(out, "%s",
+ prt_printf(out, "State:\t%s\n",
BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
? bch2_member_states[BCH_MEMBER_STATE(&m)]
: "unknown");
- prt_newline(out);
- prt_printf(out, "Data allowed:");
- prt_tab(out);
+ prt_printf(out, "Data allowed:\t");
if (BCH_MEMBER_DATA_ALLOWED(&m))
prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_printf(out, "Has data:");
- prt_tab(out);
+ prt_printf(out, "Has data:\t");
if (data_have)
prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_str(out, "Durability:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_printf(out, "Btree allocated bitmap blocksize:\t");
+ if (m.btree_bitmap_shift < 64)
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ else
+ prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
prt_newline(out);
- prt_printf(out, "Discard:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Btree allocated bitmap:\t");
+ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
prt_newline(out);
- prt_printf(out, "Freespace initialized:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_newline(out);
+ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+
+ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
printbuf_indent_sub(out, 2);
}
-static int bch2_sb_members_v1_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
@@ -310,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v1_get(mi, i), gi, sb, i);
}
@@ -326,15 +349,32 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ if (!le16_to_cpu(mi->member_bytes)) {
+ prt_printf(out, "member_bytes 0");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ /*
+ * We call to_text() on superblock sections that haven't passed
+ * validate, so we can't trust sb->nr_devices.
+ */
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
-static int bch2_sb_members_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
@@ -364,14 +404,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL) {
struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
- rcu_read_unlock();
}
void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
@@ -389,12 +428,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
printbuf_indent_sub(out, 2);
prt_str(out, "IO errors since ");
@@ -403,12 +438,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
+ atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
printbuf_indent_sub(out, 2);
}
@@ -426,3 +458,149 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan for repair, we
+ * don't have to scan entire devices.
+ */
+
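+/*
+ * Check that every pointer in @k falls within a region the owning device has
+ * marked as containing btree nodes.
+ */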
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ guard(rcu)();
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (ca &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
+ return false;
+ }
+ return true;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
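+ /*
+ * The 64 bit bitmap covers 64 << btree_bitmap_shift sectors; if @end lands
+ * past that, coarsen the granularity and fold the existing bits down.
+ */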
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
+ BUG_ON(end > 64ULL << m->btree_bitmap_shift);
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
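+/*
+ * Mark the regions referenced by btree node key @k in each member's
+ * btree_allocated_bitmap.
+ */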
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
+ continue;
+
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+ }
+}
+
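+/* Number of member slots that currently have a device in them */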
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+ unsigned nr_deleted = 0;
+
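+ /*
+ * Prefer a brand new slot; if nr_devices is already at the max, reuse
+ * the empty (zero UUID) slot with the oldest last_mount time.
+ */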
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+
+ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
+
+ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ if (nr_deleted)
+ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
+ nr_deleted);
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
+
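+/*
+ * Zero out the UUIDs of members marked as deleted so their slots can be
+ * reused by bch2_sb_member_alloc().
+ */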
+void bch2_sb_members_clean_deleted(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
+
+ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
+ memset(&m->uuid, 0, sizeof(m->uuid));
+ write_sb = true;
+ }
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}