summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-12 20:47:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-12 20:47:51 -0800
commit149c51f876322d9bfbd5e2d6ffae7aff3d794384 (patch)
treea61c7dd828356e307fca06fc66dbdbf9b109c18f /include
parent97971df811b8854882c0f6c6631e23ab8cdcc44f (diff)
parentb7af0635c87ff78d6bd523298ab7471f9ffd3ce5 (diff)
Merge tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "This round there are a lot of cleanups and moved code so the diffstat looks huge, otherwise there are some nice performance improvements and an update to raid56 reliability. User visible features: - raid56 reliability vs performance trade off: - fix destructive RMW for raid5 data (raid6 still needs work): do full checksum verification for all data during RMW cycle, this should prevent rewriting potentially corrupted data without notice - stripes are cached in memory which should reduce the performance impact but still can hurt some workloads - checksums are verified after repair again - this is the last option without introducing additional features (write intent bitmap, journal, another tree), the extra checksum read/verification was supposed to be avoided by the original implementation exactly for performance reasons but that caused all the reliability problems - discard=async by default for devices that support it - implement emergency flush reserve to avoid almost all unnecessary transaction aborts due to ENOSPC in cases where there are too many delayed refs or delayed allocation - skip block group synchronization if there's no change in used bytes, can reduce transaction commit count for some workloads Performance improvements: - fiemap and lseek: - overall speedup due to skipping unnecessary or duplicate searches (-40% run time) - cache some data structures and sharedness of extents (-30% run time) - send: - faster backref resolution when finding clones - cached leaf to root mapping for faster backref walking - improved clone/sharing detection - overall run time improvements (-70%) Core: - module initialization converted to a table of function pointers run in a sequence - preparation for fscrypt, extend passing file names across calls, dir item can store encryption status - raid56 updates: - more accurate error tracking of sectors within stripe - simplify recovery path and remove dedicated endio worker kthread - simplify scrub call paths - refactoring to support the extra data checksum verification during RMW cycle - tree block parentness checks consolidated and done at metadata read time - improved error handling - cleanups: - move a lot of code for better synchronization between kernel and user space sources, split big files - enum cleanups - GFP flag cleanups - header file cleanups, prototypes, dependencies - redundant parameter cleanups - inline extent handling simplifications - inode parameter conversion - data structure cleanups, reductions, renames, merges" * tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (249 commits) btrfs: print transaction aborted messages with an error level btrfs: sync some cleanups from progs into uapi/btrfs.h btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range btrfs: fix extent map use-after-free when handling missing device in read_one_chunk btrfs: remove outdated logic from overwrite_item() and add assertion btrfs: unify overwrite_item() and do_overwrite_item() btrfs: replace strncpy() with strscpy() btrfs: fix uninitialized variable in find_first_clear_extent_bit btrfs: fix uninitialized parent in insert_state btrfs: add might_sleep() annotations btrfs: add stack helpers for a few btrfs items btrfs: add nr_global_roots to the super block definition btrfs: remove BTRFS_LEAF_DATA_OFFSET btrfs: add helpers for manipulating leaf items and data btrfs: add eb to btrfs_node_key_ptr_offset btrfs: pass the extent buffer for the btrfs_item_nr helpers btrfs: move the csum helpers into ctree.h btrfs: move eb offset helpers into extent_io.h btrfs: move file_extent_item helpers into file-item.h btrfs: move leaf_data_end into ctree.c ...
Diffstat (limited to 'include')
-rw-r--r--include/trace/events/btrfs.h27
-rw-r--r--include/uapi/linux/btrfs.h36
-rw-r--r--include/uapi/linux/btrfs_tree.h235
3 files changed, 277 insertions, 21 deletions
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index ed50e81174bf..0bce0b4ff2fa 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1993,12 +1993,11 @@ TRACE_EVENT(btrfs_set_extent_bit,
TP_fast_assign_btrfs(tree->fs_info,
__entry->owner = tree->owner;
- if (tree->private_data) {
- const struct inode *inode = tree->private_data;
+ if (tree->inode) {
+ const struct btrfs_inode *inode = tree->inode;
- __entry->ino = btrfs_ino(BTRFS_I(inode));
- __entry->rootid =
- BTRFS_I(inode)->root->root_key.objectid;
+ __entry->ino = btrfs_ino(inode);
+ __entry->rootid = inode->root->root_key.objectid;
} else {
__entry->ino = 0;
__entry->rootid = 0;
@@ -2032,12 +2031,11 @@ TRACE_EVENT(btrfs_clear_extent_bit,
TP_fast_assign_btrfs(tree->fs_info,
__entry->owner = tree->owner;
- if (tree->private_data) {
- const struct inode *inode = tree->private_data;
+ if (tree->inode) {
+ const struct btrfs_inode *inode = tree->inode;
- __entry->ino = btrfs_ino(BTRFS_I(inode));
- __entry->rootid =
- BTRFS_I(inode)->root->root_key.objectid;
+ __entry->ino = btrfs_ino(inode);
+ __entry->rootid = inode->root->root_key.objectid;
} else {
__entry->ino = 0;
__entry->rootid = 0;
@@ -2072,12 +2070,11 @@ TRACE_EVENT(btrfs_convert_extent_bit,
TP_fast_assign_btrfs(tree->fs_info,
__entry->owner = tree->owner;
- if (tree->private_data) {
- const struct inode *inode = tree->private_data;
+ if (tree->inode) {
+ const struct btrfs_inode *inode = tree->inode;
- __entry->ino = btrfs_ino(BTRFS_I(inode));
- __entry->rootid =
- BTRFS_I(inode)->root->root_key.objectid;
+ __entry->ino = btrfs_ino(inode);
+ __entry->rootid = inode->root->root_key.objectid;
} else {
__entry->ino = 0;
__entry->rootid = 0;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5655e89b962b..b4f0f9531119 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -19,8 +19,14 @@
#ifndef _UAPI_LINUX_BTRFS_H
#define _UAPI_LINUX_BTRFS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/fs.h>
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_VOL_NAME_MAX 255
@@ -333,6 +339,12 @@ struct btrfs_ioctl_feature_flags {
*/
struct btrfs_balance_args {
__u64 profiles;
+
+ /*
+ * usage filter
+ * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+ * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+ */
union {
__u64 usage;
struct {
@@ -549,7 +561,7 @@ struct btrfs_ioctl_search_header {
__u64 offset;
__u32 type;
__u32 len;
-};
+} __attribute__ ((__may_alias__));
#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
/*
@@ -562,6 +574,10 @@ struct btrfs_ioctl_search_args {
char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
};
+/*
+ * Extended version of TREE_SEARCH ioctl that can return more than 4k of bytes.
+ * The allocated size of the buffer is set in buf_size.
+ */
struct btrfs_ioctl_search_args_v2 {
struct btrfs_ioctl_search_key key; /* in/out - search parameters */
__u64 buf_size; /* in - size of buffer
@@ -570,10 +586,11 @@ struct btrfs_ioctl_search_args_v2 {
__u64 buf[]; /* out - found items */
};
+/* With a @src_length of zero, the range from @src_offset->EOF is cloned! */
struct btrfs_ioctl_clone_range_args {
- __s64 src_fd;
- __u64 src_offset, src_length;
- __u64 dest_offset;
+ __s64 src_fd;
+ __u64 src_offset, src_length;
+ __u64 dest_offset;
};
/*
@@ -677,8 +694,11 @@ struct btrfs_ioctl_logical_ino_args {
/* struct btrfs_data_container *inodes; out */
__u64 inodes;
};
-/* Return every ref to the extent, not just those containing logical block.
- * Requires logical == extent bytenr. */
+
+/*
+ * Return every ref to the extent, not just those containing logical block.
+ * Requires logical == extent bytenr.
+ */
#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0)
enum btrfs_dev_stat_values {
@@ -1144,4 +1164,8 @@ enum btrfs_err_code {
#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
struct btrfs_ioctl_encoded_io_args)
+#ifdef __cplusplus
+}
+#endif
+
#endif /* _UAPI_LINUX_BTRFS_H */
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 1f7a38ec6ac3..ab38d0f411fa 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -10,6 +10,23 @@
#include <stddef.h>
#endif
+/* ASCII for _BHRfS_M, no terminating nul */
+#define BTRFS_MAGIC 0x4D5F53665248425FULL
+
+#define BTRFS_MAX_LEVEL 8
+
+/*
+ * We can actually store much bigger names, but lets not confuse the rest of
+ * linux.
+ */
+#define BTRFS_NAME_LEN 255
+
+/*
+ * Theoretical limit is larger, but we keep this down to a sane value. That
+ * should limit greatly the possibility of collisions on inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
/*
* This header contains the structure definitions and constants used
* by file system objects that can be retrieved using
@@ -359,6 +376,50 @@ enum btrfs_csum_type {
#define BTRFS_FT_SYMLINK 7
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
+/* Directory contains encrypted data */
+#define BTRFS_FT_ENCRYPTED 0x80
+
+static inline __u8 btrfs_dir_flags_to_ftype(__u8 flags)
+{
+ return flags & ~BTRFS_FT_ENCRYPTED;
+}
+
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM (1U << 0)
+#define BTRFS_INODE_NODATACOW (1U << 1)
+#define BTRFS_INODE_READONLY (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS (1U << 3)
+#define BTRFS_INODE_PREALLOC (1U << 4)
+#define BTRFS_INODE_SYNC (1U << 5)
+#define BTRFS_INODE_IMMUTABLE (1U << 6)
+#define BTRFS_INODE_APPEND (1U << 7)
+#define BTRFS_INODE_NODUMP (1U << 8)
+#define BTRFS_INODE_NOATIME (1U << 9)
+#define BTRFS_INODE_DIRSYNC (1U << 10)
+#define BTRFS_INODE_COMPRESS (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
+
+#define BTRFS_INODE_FLAG_MASK \
+ (BTRFS_INODE_NODATASUM | \
+ BTRFS_INODE_NODATACOW | \
+ BTRFS_INODE_READONLY | \
+ BTRFS_INODE_NOCOMPRESS | \
+ BTRFS_INODE_PREALLOC | \
+ BTRFS_INODE_SYNC | \
+ BTRFS_INODE_IMMUTABLE | \
+ BTRFS_INODE_APPEND | \
+ BTRFS_INODE_NODUMP | \
+ BTRFS_INODE_NOATIME | \
+ BTRFS_INODE_DIRSYNC | \
+ BTRFS_INODE_COMPRESS | \
+ BTRFS_INODE_ROOT_ITEM_INIT)
+
+#define BTRFS_INODE_RO_VERITY (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
/*
* The key defines the order in the tree, and so it also defines (optimal)
@@ -389,6 +450,109 @@ struct btrfs_key {
__u64 offset;
} __attribute__ ((__packed__));
+/*
+ * Every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+ /* These first four must match the super block */
+ __u8 csum[BTRFS_CSUM_SIZE];
+ /* FS specific uuid */
+ __u8 fsid[BTRFS_FSID_SIZE];
+ /* Which block this node is supposed to live in */
+ __le64 bytenr;
+ __le64 flags;
+
+ /* Allowed to be different from the super from here on down */
+ __u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ __le64 generation;
+ __le64 owner;
+ __le32 nritems;
+ __u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * This is a very generous portion of the super block, giving us room to
+ * translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+
+/*
+ * Just in case we somehow lose the roots and are not able to mount, we store
+ * an array of the roots from previous transactions in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unused_64[4];
+
+ __u8 tree_root_level;
+ __u8 chunk_root_level;
+ __u8 extent_root_level;
+ __u8 fs_root_level;
+ __u8 dev_root_level;
+ __u8 csum_root_level;
+ /* future and to align */
+ __u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
+ * A leaf is full of items. offset and size tell us where to find the item in
+ * the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+ struct btrfs_disk_key key;
+ __le32 offset;
+ __le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * Leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together during
+ * searches.
+ */
+struct btrfs_leaf {
+ struct btrfs_header header;
+ struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * All non-leaf blocks are nodes, they hold only keys and pointers to other
+ * blocks.
+ */
+struct btrfs_key_ptr {
+ struct btrfs_disk_key key;
+ __le64 blockptr;
+ __le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+ struct btrfs_header header;
+ struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
struct btrfs_dev_item {
/* the internal btrfs device id */
__le64 devid;
@@ -472,6 +636,69 @@ struct btrfs_chunk {
/* additional stripes go here */
} __attribute__ ((__packed__));
+/*
+ * The super block basically lists the main trees of the FS.
+ */
+struct btrfs_super_block {
+ /* The first 4 fields must match struct btrfs_header */
+ __u8 csum[BTRFS_CSUM_SIZE];
+ /* FS specific UUID, visible to user */
+ __u8 fsid[BTRFS_FSID_SIZE];
+ /* This block number */
+ __le64 bytenr;
+ __le64 flags;
+
+ /* Allowed to be different from the btrfs_header from here own down */
+ __le64 magic;
+ __le64 generation;
+ __le64 root;
+ __le64 chunk_root;
+ __le64 log_root;
+
+ /*
+ * This member has never been utilized since the very beginning, thus
+ * it's always 0 regardless of kernel version. We always use
+ * generation + 1 to read log tree root. So here we mark it deprecated.
+ */
+ __le64 __unused_log_root_transid;
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 root_dir_objectid;
+ __le64 num_devices;
+ __le32 sectorsize;
+ __le32 nodesize;
+ __le32 __unused_leafsize;
+ __le32 stripesize;
+ __le32 sys_chunk_array_size;
+ __le64 chunk_root_generation;
+ __le64 compat_flags;
+ __le64 compat_ro_flags;
+ __le64 incompat_flags;
+ __le16 csum_type;
+ __u8 root_level;
+ __u8 chunk_root_level;
+ __u8 log_root_level;
+ struct btrfs_dev_item dev_item;
+
+ char label[BTRFS_LABEL_SIZE];
+
+ __le64 cache_generation;
+ __le64 uuid_tree_generation;
+
+ /* The UUID written into btree blocks */
+ __u8 metadata_uuid[BTRFS_FSID_SIZE];
+
+ __u64 nr_global_roots;
+
+ /* Future expansion */
+ __le64 reserved[27];
+ __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ __u8 padding[565];
+} __attribute__ ((__packed__));
+
#define BTRFS_FREE_SPACE_EXTENT 1
#define BTRFS_FREE_SPACE_BITMAP 2
@@ -526,6 +753,14 @@ struct btrfs_extent_item_v0 {
/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
+#define BTRFS_BACKREF_REV_MAX 256
+#define BTRFS_BACKREF_REV_SHIFT 56
+#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+ BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV 0
+#define BTRFS_MIXED_BACKREF_REV 1
+
/*
* this flag is only used internally by scrub and may be changed at any time
* it is only declared here to avoid collisions