diff options
Diffstat (limited to 'fs/bcachefs/disk_accounting_format.h')
-rw-r--r-- | fs/bcachefs/disk_accounting_format.h | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h new file mode 100644 index 000000000000..8269af1dbe2a --- /dev/null +++ b/fs/bcachefs/disk_accounting_format.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H + +#include "replicas_format.h" + +/* + * Disk accounting - KEY_TYPE_accounting - on disk format: + * + * Here, the key has considerably more structure than a typical key (bpos); an + * accounting key is 'struct disk_accounting_pos', which is a union of bpos. + * + * More specifically: a key is just a muliword integer (where word endianness + * matches native byte order), so we're treating bpos as an opaque 20 byte + * integer and mapping bch_accounting_key to that. + * + * This is a type-tagged union of all our various subtypes; a disk accounting + * key can be device counters, replicas counters, et cetera - it's extensible. + * + * The value is a list of u64s or s64s; the number of counters is specific to a + * given accounting type. + * + * Unlike with other key types, updates are _deltas_, and the deltas are not + * resolved until the update to the underlying btree, done by btree write buffer + * flush or journal replay. + * + * Journal replay in particular requires special handling. The journal tracks a + * range of entries which may possibly have not yet been applied to the btree + * yet - it does not know definitively whether individual entries are dirty and + * still need to be applied. + * + * To handle this, we use the version field of struct bkey, and give every + * accounting update a unique version number - a total ordering in time; the + * version number is derived from the key's position in the journal. Then + * journal replay can compare the version number of the key from the journal + * with the version number of the key in the btree to determine if a key needs + * to be replayed. + * + * For this to work, we must maintain this strict time ordering of updates as + * they are flushed to the btree, both via write buffer flush and via journal + * replay. This has complications for the write buffer code while journal replay + * is still in progress; the write buffer cannot flush any accounting keys to + * the btree until journal replay has finished replaying its accounting keys, or + * the (newer) version number of the keys from the write buffer will cause + * updates from journal replay to be lost. + */ + +struct bch_accounting { + struct bch_val v; + __u64 d[]; +}; + +#define BCH_ACCOUNTING_MAX_COUNTERS 3 + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) \ + x(unstriped, 10) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +/* + * field 1: name + * field 2: id + * field 3: number of counters (max 3) + */ + +#define BCH_DISK_ACCOUNTING_TYPES() \ + x(nr_inodes, 0, 1) \ + x(persistent_reserved, 1, 1) \ + x(replicas, 2, 1) \ + x(dev_data_type, 3, 3) \ + x(compression, 4, 3) \ + x(snapshot, 5, 1) \ + x(btree, 6, 1) \ + x(rebalance_work, 7, 1) \ + x(inum, 8, 3) + +enum disk_accounting_type { +#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + BCH_DISK_ACCOUNTING_TYPE_NR, +}; + +/* + * No subtypes - number of inodes in the entire filesystem + * + * XXX: perhaps we could add a per-subvolume counter? + */ +struct bch_acct_nr_inodes { +}; + +/* + * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the + * reservation: + */ +struct bch_acct_persistent_reserved { + __u8 nr_replicas; +}; + +/* + * device, data type counter fields: + * [ + * nr_buckets + * live sectors (in buckets of that data type) + * sectors of internal fragmentation + * ] + * + * XXX: live sectors should've been done differently, you can have multiple data + * types in the same bucket (user, stripe, cached) and this collapses them to + * the bucket data type, and makes the internal fragmentation counter redundant + */ +struct bch_acct_dev_data_type { + __u8 dev; + __u8 data_type; +}; + +/* + * Compression type fields: + * [ + * number of extents + * uncompressed size + * compressed size + * ] + * + * Compression ratio, average extent size (fragmentation). + */ +struct bch_acct_compression { + __u8 type; +}; + +/* + * On disk usage by snapshot id; counts same values as replicas counter, but + * aggregated differently + */ +struct bch_acct_snapshot { + __u32 id; +} __packed; + +struct bch_acct_btree { + __u32 id; +} __packed; + +/* + * inum counter fields: + * [ + * number of extents + * sum of extent sizes - bkey size + * this field is similar to inode.bi_sectors, except here extents in + * different snapshots but the same inode number are all collapsed to the + * same counter + * sum of on disk size - same values tracked by replicas counters + * ] + * + * This tracks on disk fragmentation. + */ +struct bch_acct_inum { + __u64 inum; +} __packed; + +/* + * Simple counter of the amount of data (on disk sectors) rebalance needs to + * move, extents counted here are also in the rebalance_work btree. + */ +struct bch_acct_rebalance_work { +}; + +struct disk_accounting_pos { + union { + struct { + __u8 type; + union { + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; + struct bch_replicas_entry_v1 replicas; + struct bch_acct_dev_data_type dev_data_type; + struct bch_acct_compression compression; + struct bch_acct_snapshot snapshot; + struct bch_acct_btree btree; + struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; + } __packed; + } __packed; + struct bpos _pad; + }; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ |