path: root/fs/btrfs
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig | 46
-rw-r--r--  fs/btrfs/Makefile | 4
-rw-r--r--  fs/btrfs/accessors.c | 164
-rw-r--r--  fs/btrfs/accessors.h | 39
-rw-r--r--  fs/btrfs/acl.c | 25
-rw-r--r--  fs/btrfs/acl.h | 2
-rw-r--r--  fs/btrfs/async-thread.c | 20
-rw-r--r--  fs/btrfs/backref.c | 292
-rw-r--r--  fs/btrfs/backref.h | 50
-rw-r--r--  fs/btrfs/bio.c | 447
-rw-r--r--  fs/btrfs/bio.h | 42
-rw-r--r--  fs/btrfs/block-group.c | 571
-rw-r--r--  fs/btrfs/block-group.h | 22
-rw-r--r--  fs/btrfs/block-rsv.c | 35
-rw-r--r--  fs/btrfs/block-rsv.h | 1
-rw-r--r--  fs/btrfs/btrfs_inode.h | 64
-rw-r--r--  fs/btrfs/compression.c | 410
-rw-r--r--  fs/btrfs/compression.h | 92
-rw-r--r--  fs/btrfs/ctree.c | 659
-rw-r--r--  fs/btrfs/ctree.h | 105
-rw-r--r--  fs/btrfs/defrag.c | 294
-rw-r--r--  fs/btrfs/defrag.h | 4
-rw-r--r--  fs/btrfs/delalloc-space.c | 55
-rw-r--r--  fs/btrfs/delalloc-space.h | 4
-rw-r--r--  fs/btrfs/delayed-inode.c | 511
-rw-r--r--  fs/btrfs/delayed-inode.h | 109
-rw-r--r--  fs/btrfs/delayed-ref.c | 150
-rw-r--r--  fs/btrfs/delayed-ref.h | 19
-rw-r--r--  fs/btrfs/dev-replace.c | 90
-rw-r--r--  fs/btrfs/dev-replace.h | 2
-rw-r--r--  fs/btrfs/dir-item.c | 34
-rw-r--r--  fs/btrfs/dir-item.h | 3
-rw-r--r--  fs/btrfs/direct-io.c | 117
-rw-r--r--  fs/btrfs/direct-io.h | 2
-rw-r--r--  fs/btrfs/discard.c | 53
-rw-r--r--  fs/btrfs/discard.h | 1
-rw-r--r--  fs/btrfs/disk-io.c | 564
-rw-r--r--  fs/btrfs/disk-io.h | 14
-rw-r--r--  fs/btrfs/export.c | 61
-rw-r--r--  fs/btrfs/extent-io-tree.c | 530
-rw-r--r--  fs/btrfs/extent-io-tree.h | 166
-rw-r--r--  fs/btrfs/extent-tree.c | 827
-rw-r--r--  fs/btrfs/extent-tree.h | 48
-rw-r--r--  fs/btrfs/extent_io.c | 2147
-rw-r--r--  fs/btrfs/extent_io.h | 46
-rw-r--r--  fs/btrfs/extent_map.c | 282
-rw-r--r--  fs/btrfs/extent_map.h | 50
-rw-r--r--  fs/btrfs/fiemap.c | 13
-rw-r--r--  fs/btrfs/file-item.c | 209
-rw-r--r--  fs/btrfs/file-item.h | 10
-rw-r--r--  fs/btrfs/file.c | 1117
-rw-r--r--  fs/btrfs/file.h | 2
-rw-r--r--  fs/btrfs/free-space-cache.c | 152
-rw-r--r--  fs/btrfs/free-space-tree.c | 627
-rw-r--r--  fs/btrfs/free-space-tree.h | 52
-rw-r--r--  fs/btrfs/fs.c | 179
-rw-r--r--  fs/btrfs/fs.h | 154
-rw-r--r--  fs/btrfs/inode-item.c | 77
-rw-r--r--  fs/btrfs/inode-item.h | 11
-rw-r--r--  fs/btrfs/inode.c | 2729
-rw-r--r--  fs/btrfs/ioctl.c | 992
-rw-r--r--  fs/btrfs/ioctl.h | 11
-rw-r--r--  fs/btrfs/locking.c | 11
-rw-r--r--  fs/btrfs/locking.h | 19
-rw-r--r--  fs/btrfs/lzo.c | 98
-rw-r--r--  fs/btrfs/messages.c | 2
-rw-r--r--  fs/btrfs/messages.h | 188
-rw-r--r--  fs/btrfs/misc.h | 89
-rw-r--r--  fs/btrfs/ordered-data.c | 184
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/print-tree.c | 268
-rw-r--r--  fs/btrfs/print-tree.h | 2
-rw-r--r--  fs/btrfs/props.c | 66
-rw-r--r--  fs/btrfs/props.h | 8
-rw-r--r--  fs/btrfs/qgroup.c | 695
-rw-r--r--  fs/btrfs/qgroup.h | 3
-rw-r--r--  fs/btrfs/raid-stripe-tree.c | 174
-rw-r--r--  fs/btrfs/raid-stripe-tree.h | 1
-rw-r--r--  fs/btrfs/raid56.c | 969
-rw-r--r--  fs/btrfs/raid56.h | 107
-rw-r--r--  fs/btrfs/rcu-string.h | 58
-rw-r--r--  fs/btrfs/ref-verify.c | 159
-rw-r--r--  fs/btrfs/ref-verify.h | 8
-rw-r--r--  fs/btrfs/reflink.c | 155
-rw-r--r--  fs/btrfs/relocation.c | 835
-rw-r--r--  fs/btrfs/relocation.h | 3
-rw-r--r--  fs/btrfs/root-tree.c | 72
-rw-r--r--  fs/btrfs/scrub.c | 874
-rw-r--r--  fs/btrfs/scrub.h | 2
-rw-r--r--  fs/btrfs/send.c | 1237
-rw-r--r--  fs/btrfs/send.h | 4
-rw-r--r--  fs/btrfs/space-info.c | 707
-rw-r--r--  fs/btrfs/space-info.h | 71
-rw-r--r--  fs/btrfs/subpage.c | 481
-rw-r--r--  fs/btrfs/subpage.h | 97
-rw-r--r--  fs/btrfs/super.c | 533
-rw-r--r--  fs/btrfs/sysfs.c | 363
-rw-r--r--  fs/btrfs/sysfs.h | 10
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 50
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 6
-rw-r--r--  fs/btrfs/tests/delayed-refs-tests.c | 1016
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 94
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c | 111
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 93
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 107
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 16
-rw-r--r--  fs/btrfs/tests/raid-stripe-tree-tests.c | 661
-rw-r--r--  fs/btrfs/transaction.c | 259
-rw-r--r--  fs/btrfs/transaction.h | 22
-rw-r--r--  fs/btrfs/tree-checker.c | 225
-rw-r--r--  fs/btrfs/tree-checker.h | 7
-rw-r--r--  fs/btrfs/tree-log.c | 2948
-rw-r--r--  fs/btrfs/tree-log.h | 8
-rw-r--r--  fs/btrfs/tree-mod-log.c | 81
-rw-r--r--  fs/btrfs/ulist.c | 59
-rw-r--r--  fs/btrfs/uuid-tree.c | 122
-rw-r--r--  fs/btrfs/verity.c | 48
-rw-r--r--  fs/btrfs/volumes.c | 1107
-rw-r--r--  fs/btrfs/volumes.h | 88
-rw-r--r--  fs/btrfs/xattr.c | 51
-rw-r--r--  fs/btrfs/xattr.h | 2
-rw-r--r--  fs/btrfs/zlib.c | 170
-rw-r--r--  fs/btrfs/zoned.c | 627
-rw-r--r--  fs/btrfs/zoned.h | 26
-rw-r--r--  fs/btrfs/zstd.c | 259
125 files changed, 18918 insertions, 13515 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..4438637c8900 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
@@ -52,20 +52,24 @@ config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
help
- This will run some basic sanity tests on the free space cache
- code to make sure it is acting as it should. These are mostly
- regression tests and are only really interesting to btrfs
- developers.
+ This will run sanity tests for core functionality like free space,
+ extent maps, extent io, extent buffers, inodes, qgroups and others,
+ at module load time. These are mostly regression tests and are only
+ interesting to developers.
If unsure, say N.
config BTRFS_DEBUG
bool "Btrfs debugging support"
depends on BTRFS_FS
+ select REF_TRACKER if STACKTRACE_SUPPORT
help
- Enable run-time debugging support for the btrfs filesystem. This may
- enable additional and expensive checks with negative impact on
- performance, or export extra information via sysfs.
+ Enable run-time debugging support for the btrfs filesystem.
+
+ Additional potentially expensive checks, debugging functionality or
+ sysfs exported information is enabled, like leak checks of internal
+ objects, optional forced space fragmentation and /sys/fs/btrfs/debug .
+ This has negative impact on performance.
If unsure, say N.
@@ -73,8 +77,10 @@ config BTRFS_ASSERT
bool "Btrfs assert support"
depends on BTRFS_FS
help
- Enable run-time assertion checking. This will result in panics if
- any of the assertions trip. This is meant for btrfs developers only.
+ Enable run-time assertion checking. Additional safety checks are
+ done, simple enough not to affect performance but verify invariants
+ and assumptions of code to run properly. This may result in panics,
+ and is meant for developers but can be enabled in general.
If unsure, say N.
@@ -89,7 +95,14 @@ config BTRFS_EXPERIMENTAL
Current list:
- - extent map shrinker - performance problems with too frequent shrinks
+ - COW fixup worker warning - last warning before removing the
+ functionality catching out-of-band page
+ dirtying, not necessary since 5.8
+
+ - RAID mirror read policy - additional read policies for balancing
+ reading from redundant block group
+ profiles (currently: pid, round-robin,
+ fixed devid)
- send stream protocol v3 - fs-verity support
@@ -102,15 +115,6 @@ config BTRFS_EXPERIMENTAL
- extent tree v2 - complex rework of extent tracking
- If unsure, say N.
-
-config BTRFS_FS_REF_VERIFY
- bool "Btrfs with the ref verify tool compiled in"
- depends on BTRFS_FS
- default n
- help
- Enable run-time extent reference verification instrumentation. This
- is meant to be used by btrfs developers for tracking down extent
- reference problems or verifying they didn't break something.
+ - large folio support
If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3cfc440c636c..743d7677b175 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
@@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
tests/free-space-tree-tests.o tests/extent-map-tests.o \
- tests/raid-stripe-tree-tests.o
+ tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index e3716516ca38..1248aa2535d3 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -9,27 +9,24 @@
#include "fs.h"
#include "accessors.h"
-static bool check_setget_bounds(const struct extent_buffer *eb,
- const void *ptr, unsigned off, int size)
+static void __cold report_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- const unsigned long member_offset = (unsigned long)ptr + off;
+ unsigned long member_offset = (unsigned long)ptr + off;
- if (unlikely(member_offset + size > eb->len)) {
- btrfs_warn(eb->fs_info,
- "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
- (member_offset > eb->len ? "start" : "end"),
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
-
- return true;
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
}
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+/* Copy bytes from @src1 and @src2 to @dest. */
+static __always_inline void memcpy_split_src(char *dest, const char *src1,
+ const char *src2, const size_t len1,
+ const size_t total)
{
- token->eb = eb;
- token->kaddr = folio_address(eb->folios[0]);
- token->offset = 0;
+ memcpy(dest, src1, len1);
+ memcpy(dest + len1, src2, total - len1);
}
/*
@@ -41,134 +38,77 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* - btrfs_set_8 (for 8/16/32/64)
* - btrfs_get_8 (for 8/16/32/64)
*
- * Generic helpers with a token (cached address of the most recently accessed
- * page):
- * - btrfs_set_token_8 (for 8/16/32/64)
- * - btrfs_get_token_8 (for 8/16/32/64)
- *
* The set/get functions handle data spanning two pages transparently, in case
* metadata block size is larger than page. Every pointer to metadata items is
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
* The extent buffer pages stored in the array folios may not form a contiguous
- * phyusical range, but the API functions assume the linear offset to the range
+ * physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
- member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
- u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
- \
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- return get_unaligned_le##bits(token->kaddr + oil); \
- } \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(token->kaddr + oil); \
- \
- memcpy(lebytes, token->kaddr + oil, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(lebytes + part, token->kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
- member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
- u8 lebytes[sizeof(u##bits)]; \
- \
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(kaddr + oil); \
- \
- memcpy(lebytes, kaddr + oil, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(lebytes + part, kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
\
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return 0; \
} \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) \
+ return get_unaligned_le##bits(kaddr); \
+ \
+ if (sizeof(u##bits) == 2) { \
+ lebytes[0] = *kaddr; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ lebytes[1] = *kaddr; \
+ } else { \
+ memcpy_split_src(lebytes, kaddr, \
+ folio_address(eb->folios[idx + 1]), \
+ part, sizeof(u##bits)); \
} \
- put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oil, lebytes, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(token->kaddr, lebytes + part, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
\
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, kaddr + oil); \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return; \
+ } \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) { \
+ put_unaligned_le##bits(val, kaddr); \
return; \
} \
- \
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oil, lebytes, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(kaddr, lebytes + part, size - part); \
+ if (sizeof(u##bits) == 2) { \
+ *kaddr = lebytes[0]; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ *kaddr = lebytes[1]; \
+ } else { \
+ memcpy(kaddr, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \
+ } \
}
DEFINE_BTRFS_SETGET_BITS(8)
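
The comment in the accessors.c hunk above explains that the generated set/get helpers must handle a member whose bytes straddle two non-contiguous folios; the new memcpy_split_src() stitches the two halves into a small on-stack buffer before the little-endian conversion, and the two-byte case is special-cased with direct byte copies. The following is a minimal userspace sketch of that split-read idea only; get_le32_split() is an illustrative name, not a kernel function.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Read a little-endian u32 whose bytes may be split across two buffers:
 * len1 bytes come from the end of src1, the rest from the start of src2. */
static uint32_t get_le32_split(const uint8_t *src1, size_t len1, const uint8_t *src2)
{
	uint8_t lebytes[sizeof(uint32_t)];

	if (len1 >= sizeof(lebytes)) {
		/* Fast path: the whole member sits in the first buffer. */
		memcpy(lebytes, src1, sizeof(lebytes));
	} else {
		memcpy(lebytes, src1, len1);                          /* tail of the first buffer */
		memcpy(lebytes + len1, src2, sizeof(lebytes) - len1); /* head of the second buffer */
	}
	return (uint32_t)lebytes[0] | ((uint32_t)lebytes[1] << 8) |
	       ((uint32_t)lebytes[2] << 16) | ((uint32_t)lebytes[3] << 24);
}

int main(void)
{
	const uint8_t a[] = { 0x78, 0x56 };	/* low bytes at the end of buffer one */
	const uint8_t b[] = { 0x34, 0x12 };	/* high bytes at the start of buffer two */

	return get_le32_split(a, sizeof(a), b) == 0x12345678 ? 0 : 1;
}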
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 7a7e0ef69973..78721412951c 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -12,17 +12,11 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+#include "extent_io.h"
struct extent_buffer;
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
-
/*
* Some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple one
@@ -55,11 +49,6 @@ static inline void put_unaligned_le8(u8 val, void *p)
sizeof_field(type, member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
@@ -82,18 +71,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
{ \
static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
@@ -478,18 +455,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
int slot, u32 val) \
{ \
btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
}
BTRFS_ITEM_SETGET_FUNCS(offset)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e0ba00d64ea0..c336e2ab7f8a 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -14,12 +14,13 @@
#include "ctree.h"
#include "xattr.h"
#include "acl.h"
+#include "misc.h"
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
const char *name;
- char *value = NULL;
+ char AUTO_KFREE(value);
struct posix_acl *acl;
if (rcu)
@@ -49,7 +50,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
acl = NULL;
else
acl = ERR_PTR(size);
- kfree(value);
return acl;
}
@@ -59,7 +59,7 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
{
int ret, size = 0;
const char *name;
- char *value = NULL;
+ char AUTO_KFREE(value);
switch (type) {
case ACL_TYPE_ACCESS:
@@ -85,28 +85,23 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
- if (!value) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!value)
+ return -ENOMEM;
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
- goto out;
+ return ret;
}
if (trans)
ret = btrfs_setxattr(trans, inode, name, value, size, 0);
else
ret = btrfs_setxattr_trans(inode, name, value, size, 0);
+ if (ret < 0)
+ return ret;
-out:
- kfree(value);
-
- if (!ret)
- set_cached_acl(inode, type, acl);
-
- return ret;
+ set_cached_acl(inode, type, acl);
+ return 0;
}
int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
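
The acl.c changes above rely on scope-based cleanup: declaring the buffer as char AUTO_KFREE(value) makes it be freed automatically when value goes out of scope, which is why the explicit kfree() calls and the out: label are gone. AUTO_KFREE comes from the newly included misc.h; the sketch below only illustrates the underlying cleanup-attribute mechanism in plain userspace C under that assumption and is not the kernel implementation.

#include <stdio.h>
#include <stdlib.h>

/* Cleanup callback: runs automatically when the annotated variable leaves scope. */
static void free_charp(char **p)
{
	free(*p);
}

int main(void)
{
	char *value __attribute__((cleanup(free_charp))) = malloc(32);

	if (!value)
		return 1;	/* early return: no explicit free or "out:" label needed */
	snprintf(value, 32, "scope-managed buffer");
	puts(value);
	return 0;		/* value is freed here as well */
}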
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 48b9ddae4a46..0458cd51ed48 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+#include <linux/types.h>
+
struct posix_acl;
struct inode;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 361a866c1995..6c6f3bb58f4e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -18,7 +18,7 @@ enum {
};
#define NO_THRESHOLD (-1)
-#define DFT_THRESHOLD (32)
+#define DEFAULT_THRESHOLD (32)
struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
@@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
ret->limit_active = limit_active;
if (thresh == 0)
- thresh = DFT_THRESHOLD;
+ thresh = DEFAULT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
- if (thresh < DFT_THRESHOLD) {
+ if (thresh < DEFAULT_THRESHOLD) {
ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
@@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
- int need_change = 0;
+ bool need_change = false;
if (wq->thresh == NO_THRESHOLD)
return;
@@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
new_current_active--;
new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
if (new_current_active != wq->current_active) {
- need_change = 1;
+ need_change = true;
wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
- if (need_change) {
+ if (need_change)
workqueue_set_max_active(wq->normal_wq, wq->current_active);
- }
}
static void run_ordered_work(struct btrfs_workqueue *wq,
@@ -220,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
- work = list_entry(list->next, struct btrfs_work,
- ordered_list);
+ work = list_first_entry(list, struct btrfs_work, ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
/*
@@ -296,7 +294,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct btrfs_workqueue *wq = work->wq;
- int need_order = 0;
+ bool need_order = false;
/*
* We should not touch things inside work in the following cases:
@@ -307,7 +305,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
* So we save the needed things here.
*/
if (work->ordered_func)
- need_order = 1;
+ need_order = true;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04f53ca548e1..78da47a3d00e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1,
return 0;
}
+static int prelim_ref_rb_add_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct prelim_ref *ref_new =
+ rb_entry(new, struct prelim_ref, rbnode);
+ const struct prelim_ref *ref_exist =
+ rb_entry(exist, struct prelim_ref, rbnode);
+
+ /*
+ * prelim_ref_compare() expects the first parameter as the existing one,
+ * different from the rb_find_add_cached() order.
+ */
+ return prelim_ref_compare(ref_exist, ref_new);
+}
+
static void update_share_count(struct share_check *sc, int oldcount,
int newcount, const struct prelim_ref *newref)
{
@@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct share_check *sc)
{
struct rb_root_cached *root;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct prelim_ref *ref;
- int result;
- bool leftmost = true;
+ struct rb_node *exist;
root = &preftree->root;
- p = &root->rb_root.rb_node;
+ exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp);
+ if (exist) {
+ struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode);
+ /* Identical refs, merge them and free @newref */
+ struct extent_inode_elem *eie = ref->inode_list;
- while (*p) {
- parent = *p;
- ref = rb_entry(parent, struct prelim_ref, rbnode);
- result = prelim_ref_compare(ref, newref);
- if (result < 0) {
- p = &(*p)->rb_left;
- } else if (result > 0) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- /* Identical refs, merge them and free @newref */
- struct extent_inode_elem *eie = ref->inode_list;
-
- while (eie && eie->next)
- eie = eie->next;
+ while (eie && eie->next)
+ eie = eie->next;
- if (!eie)
- ref->inode_list = newref->inode_list;
- else
- eie->next = newref->inode_list;
- trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
- preftree->count);
- /*
- * A delayed ref can have newref->count < 0.
- * The ref->count is updated to follow any
- * BTRFS_[ADD|DROP]_DELAYED_REF actions.
- */
- update_share_count(sc, ref->count,
- ref->count + newref->count, newref);
- ref->count += newref->count;
- free_pref(newref);
- return;
- }
+ if (!eie)
+ ref->inode_list = newref->inode_list;
+ else
+ eie->next = newref->inode_list;
+ trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
+ preftree->count);
+ /*
+ * A delayed ref can have newref->count < 0.
+ * The ref->count is updated to follow any
+ * BTRFS_[ADD|DROP]_DELAYED_REF actions.
+ */
+ update_share_count(sc, ref->count,
+ ref->count + newref->count, newref);
+ ref->count += newref->count;
+ free_pref(newref);
+ return;
}
update_share_count(sc, 0, newref->count, newref);
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
- rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -667,10 +666,9 @@ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);
btrfs_debug(ctx->fs_info,
- "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
- ref->root_id, level, ref->count, ret,
- ref->key_for_search.objectid, ref->key_for_search.type,
- ref->key_for_search.offset);
+"search slot in root %llu (level %d, ref count %d) returned %d for key " BTRFS_KEY_FMT,
+ ref->root_id, level, ref->count, ret,
+ BTRFS_KEY_FMT_VALUE(&ref->key_for_search));
if (ret < 0)
goto out;
@@ -734,7 +732,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
struct preftrees *preftrees,
struct share_check *sc)
{
- int err;
int ret = 0;
struct ulist *parents;
struct ulist_node *node;
@@ -753,6 +750,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
*/
while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
+ int ret2;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
if (WARN(ref->parent,
@@ -774,18 +772,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
+ ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
*/
- if (err == -ENOENT) {
+ if (ret2 == -ENOENT) {
prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
- } else if (err) {
+ } else if (ret2) {
free_pref(ref);
- ret = err;
+ ret = ret2;
goto out;
}
@@ -860,7 +858,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_pref(ref);
return PTR_ERR(eb);
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
@@ -1063,7 +1061,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref,
BTRFS_REF_TYPE_ANY);
- if (type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
offset = btrfs_extent_inline_ref_offset(leaf, iref);
@@ -1400,22 +1398,22 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
ASSERT(ctx->roots == NULL);
key.objectid = ctx->bytenr;
- key.offset = (u64)-1;
if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (!ctx->trans) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
}
if (ctx->time_seq == BTRFS_SEQ_LAST)
- path->skip_locking = 1;
+ path->skip_locking = true;
again:
head = NULL;
@@ -1423,7 +1421,7 @@ again:
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1562,7 +1560,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);
+ ret = add_missing_keys(ctx->fs_info, &preftrees, !path->skip_locking);
if (ret)
goto out;
@@ -1615,7 +1613,7 @@ again:
ret = PTR_ERR(eb);
goto out;
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1653,7 +1651,7 @@ again:
* case.
*/
ASSERT(eie);
- if (!eie) {
+ if (unlikely(!eie)) {
ret = -EUCLEAN;
goto out;
}
@@ -1691,7 +1689,7 @@ out:
* @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
* added to the ulist at @ctx->refs, and that ulist is allocated by this
* function. The caller should free the ulist with free_leaf_list() if
- * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
* enough.
*
* Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
@@ -2202,22 +2200,21 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
int ret;
u64 flags;
u64 size = 0;
- u32 item_size;
const struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
+ key.objectid = logical;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2245,7 +2242,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size(eb, path->slots[0]);
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2253,7 +2249,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_debug(fs_info,
"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
logical, logical - found_key->objectid, found_key->objectid,
- found_key->offset, flags, item_size);
+ found_key->offset, flags, btrfs_item_size(eb, path->slots[0]));
WARN_ON(!flags_ret);
if (flags_ret) {
@@ -2315,7 +2311,7 @@ static int get_extent_inline_ref(unsigned long *ptr,
*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
BTRFS_REF_TYPE_ANY);
- if (*out_type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -2549,17 +2545,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
u64 flags = 0;
struct btrfs_key found_key;
- int search_commit_root = path->search_commit_root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
- btrfs_release_path(path);
+ btrfs_free_path(path);
if (ret < 0)
return ret;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
@@ -2572,8 +2571,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
walk_ctx.extent_item_pos = logical - found_key.objectid;
walk_ctx.fs_info = fs_info;
- return iterate_extent_inodes(&walk_ctx, search_commit_root,
- build_ino_list, ctx);
+ return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2787,7 +2785,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
* allocates space to return multiple file system paths for an inode.
* total_bytes to allocate are passed, note that space usable for actual path
* information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
+ * the returned pointer must be freed with __free_inode_fs_paths() in the end.
*/
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path)
@@ -2812,14 +2810,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
return ifp;
}
-void free_ipath(struct inode_fs_paths *ipath)
-{
- if (!ipath)
- return;
- kvfree(ipath->fspath);
- kfree(ipath);
-}
-
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
struct btrfs_backref_iter *ret;
@@ -2835,8 +2825,8 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf
}
/* Current backref iterator only supports iteration in commit root */
- ret->path->search_commit_root = 1;
- ret->path->skip_locking = 1;
+ ret->path->search_commit_root = true;
+ ret->path->skip_locking = true;
ret->fs_info = fs_info;
return ret;
@@ -2869,7 +2859,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2877,8 +2867,8 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = -EUCLEAN;
goto release;
}
- if (path->slots[0] == 0) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ if (unlikely(path->slots[0] == 0)) {
+ DEBUG_WARN();
ret = -EUCLEAN;
goto release;
}
@@ -3022,9 +3012,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
- INIT_LIST_HEAD(&cache->changed);
- INIT_LIST_HEAD(&cache->detached);
- INIT_LIST_HEAD(&cache->leaves);
INIT_LIST_HEAD(&cache->pending_edge);
INIT_LIST_HEAD(&cache->useless_node);
cache->fs_info = fs_info;
@@ -3132,29 +3119,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
- struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
if (!node)
return;
- BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
- upper = edge->node[UPPER];
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
-
- /*
- * Add the node to leaf node list if no other child block
- * cached.
- */
- if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower, &cache->leaves);
- upper->lowest = 1;
- }
}
btrfs_backref_drop_node(cache, node);
@@ -3166,49 +3141,25 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
struct btrfs_backref_node *node;
- int i;
- while (!list_empty(&cache->detached)) {
- node = list_entry(cache->detached.next,
- struct btrfs_backref_node, list);
+ while ((node = rb_entry_safe(rb_first(&cache->rb_root),
+ struct btrfs_backref_node, rb_node)))
btrfs_backref_cleanup_node(cache, node);
- }
- while (!list_empty(&cache->leaves)) {
- node = list_entry(cache->leaves.next,
- struct btrfs_backref_node, lower);
- btrfs_backref_cleanup_node(cache, node);
- }
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&cache->pending[i])) {
- node = list_first_entry(&cache->pending[i],
- struct btrfs_backref_node,
- list);
- btrfs_backref_cleanup_node(cache, node);
- }
- }
ASSERT(list_empty(&cache->pending_edge));
ASSERT(list_empty(&cache->useless_node));
- ASSERT(list_empty(&cache->changed));
- ASSERT(list_empty(&cache->detached));
- ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
ASSERT(!cache->nr_nodes);
ASSERT(!cache->nr_edges);
}
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
+static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
+ list_add_tail(&edge->list[LOWER], &lower->upper);
}
/*
* Handle direct tree backref
@@ -3278,7 +3229,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, cur, upper);
return 0;
}
@@ -3316,8 +3267,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
if (IS_ERR(root))
return PTR_ERR(root);
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- cur->cowonly = 1;
+
+ /* We shouldn't be using backref cache for non-shareable roots. */
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
if (btrfs_root_level(&root->root_item) == cur->level) {
/* Tree root */
@@ -3344,8 +3299,8 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
level = cur->level + 1;
/* Search the tree to find parent blocks referring to the block */
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
path->lowest_level = 0;
@@ -3359,9 +3314,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
eb = path->nodes[level];
if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
btrfs_err(fs_info,
-"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+"couldn't find block (%llu) (level %d) in tree (%llu) with key " BTRFS_KEY_FMT,
cur->bytenr, level - 1, btrfs_root_id(root),
- tree_key->objectid, tree_key->type, tree_key->offset);
+ BTRFS_KEY_FMT_VALUE(tree_key));
btrfs_put_root(root);
ret = -ENOENT;
goto out;
@@ -3403,8 +3358,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
goto out;
}
upper->owner = btrfs_header_owner(eb);
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- upper->cowonly = 1;
+
+ /* We shouldn't be using backref cache for non shareable roots. */
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_put_root(root);
+ btrfs_backref_free_edge(cache, edge);
+ btrfs_backref_free_node(cache, upper);
+ ret = -EUCLEAN;
+ goto out;
+ }
/*
* If we know the block isn't shared we can avoid
@@ -3437,7 +3399,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
}
- btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, lower, upper);
if (rb_node) {
btrfs_put_root(root);
@@ -3486,7 +3448,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
/* No extra backref? This means the tree block is corrupted */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -3498,8 +3460,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
* type BTRFS_TREE_BLOCK_REF_KEY
*/
ASSERT(list_is_singular(&cur->upper));
- edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
+ edge = list_first_entry(&cur->upper, struct btrfs_backref_edge,
+ list[LOWER]);
ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
@@ -3529,7 +3491,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
((unsigned long)iter->cur_ptr);
type = btrfs_get_extent_inline_ref_type(eb, iref,
BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -3595,15 +3557,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
- /* Insert this node to cache if it's not COW-only */
- if (!start->cowonly) {
- rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
- &start->rb_node);
- if (rb_node)
- btrfs_backref_panic(cache->fs_info, start->bytenr,
- -EEXIST);
- list_add_tail(&start->lower, &cache->leaves);
- }
+ rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
/*
* Use breadth first search to iterate all related edges.
@@ -3642,38 +3598,22 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
* parents have already been linked.
*/
if (!RB_EMPTY_NODE(&upper->rb_node)) {
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
-
list_add_tail(&edge->list[UPPER], &upper->lower);
continue;
}
/* Sanity check, we shouldn't have any unchecked nodes */
- if (!upper->checked) {
- ASSERT(0);
+ if (unlikely(!upper->checked)) {
+ DEBUG_WARN("we should not have any unchecked nodes");
return -EUCLEAN;
}
- /* Sanity check, COW-only node has non-COW-only parent */
- if (start->cowonly != upper->cowonly) {
- ASSERT(0);
+ rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node);
+ if (unlikely(rb_node)) {
+ btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
}
- /* Only cache non-COW-only (subvolume trees) tree blocks */
- if (!upper->cowonly) {
- rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- if (rb_node) {
- btrfs_backref_panic(cache->fs_info,
- upper->bytenr, -EEXIST);
- return -EUCLEAN;
- }
- }
-
list_add_tail(&edge->list[UPPER], &upper->lower);
/*
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e8c22cccb5c1..1d009b0f4c69 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx {
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
- * extents, NOCOW writes after snapshoting a root, hole punching or
+ * extents, NOCOW writes after snapshotting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
@@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, void *ctx,
- bool ignore_offset);
+ void *ctx, bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -242,7 +241,12 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
-void free_ipath(struct inode_fs_paths *ipath);
+
+DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *,
+ if (_T) {
+ kvfree(_T->fspath);
+ kfree(_T);
+ })
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
@@ -313,11 +317,22 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union{
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
+ /*
+ * This is a sanity check, whenever we COW a block we will update
+	 * new_bytenr with its current location, and we will check this in
+ * various places to validate that the cache makes sense, it shouldn't
+ * be used for anything else.
+ */
u64 new_bytenr;
/* Objectid of tree block owner, can be not uptodate */
u64 owner;
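
The union introduced above keeps the existing anonymous (rb_node, bytenr) pair of btrfs_backref_node while also exposing the same storage as a struct rb_simple_node, so the node can be handed to the rb_simple_insert() call that now takes the whole simple node. A standalone sketch of that aliasing trick follows; the *_sketch types are illustrative stand-ins, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for struct rb_node / struct rb_simple_node. */
struct rb_node_sketch { uintptr_t parent_color; void *left, *right; };
struct rb_simple_node_sketch { struct rb_node_sketch rb_node; uint64_t bytenr; };

struct backref_node_sketch {
	union {
		/* Existing view: anonymous pair used by current field accesses. */
		struct { struct rb_node_sketch rb_node; uint64_t bytenr; };
		/* New view: the same storage seen as one named simple node. */
		struct rb_simple_node_sketch simple_node;
	};
};

int main(void)
{
	struct backref_node_sketch node = { .bytenr = 4096 };

	/* Both views read the same storage, so either can be passed around. */
	printf("%llu %llu\n",
	       (unsigned long long)node.bytenr,
	       (unsigned long long)node.simple_node.bytenr);
	return 0;
}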
@@ -335,10 +350,6 @@ struct btrfs_backref_node {
struct extent_buffer *eb;
/* Level of the tree block */
unsigned int level:8;
- /* Is the block in a non-shareable tree */
- unsigned int cowonly:1;
- /* 1 if no child node is in the cache */
- unsigned int lowest:1;
/* Is the extent buffer locked */
unsigned int locked:1;
/* Has the block been processed */
@@ -391,12 +402,6 @@ struct btrfs_backref_cache {
* level blocks may not reflect the new location
*/
struct list_head pending[BTRFS_MAX_LEVEL];
- /* List of backref nodes with no child node */
- struct list_head leaves;
- /* List of blocks that have been COWed in current transaction */
- struct list_head changed;
- /* List of detached backref node. */
- struct list_head detached;
u64 last_trans;
@@ -414,7 +419,7 @@ struct btrfs_backref_cache {
/*
* Whether this cache is for relocation
*
- * Reloction backref cache require more info for reloc root compared
+ * Relocation backref cache require more info for reloc root compared
* to generic backref cache.
*/
bool is_reloc;
@@ -427,13 +432,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1 << 0)
-#define LINK_UPPER (1 << 1)
-
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which);
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 1f216d07eff6..fa1d321a2fb8 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,12 +27,12 @@ struct btrfs_failed_bio {
};
/* Is this a data path I/O that needs storage layer checksum and repair? */
-static inline bool is_data_bbio(struct btrfs_bio *bbio)
+static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
return bbio->inode && is_data_inode(bbio->inode);
}
-static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}
@@ -41,13 +41,17 @@ static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private)
{
+ /* @inode parameter is mandatory. */
+ ASSERT(inode);
+
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
- bbio->fs_info = fs_info;
+ bbio->inode = inode;
bbio->end_io = end_io;
bbio->private = private;
+ bbio->file_offset = file_offset;
atomic_set(&bbio->pending_ios, 1);
WRITE_ONCE(bbio->status, BLK_STS_OK);
}
@@ -60,7 +64,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
* a mempool.
*/
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private)
{
struct btrfs_bio *bbio;
@@ -68,7 +72,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, fs_info, end_io, private);
+ btrfs_bio_init(bbio, inode, file_offset, end_io, private);
return bbio;
}
@@ -81,46 +85,40 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
&btrfs_clone_bioset);
+ if (IS_ERR(bio))
+ return ERR_CAST(bio);
+
bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
- bbio->inode = orig_bbio->inode;
- bbio->file_offset = orig_bbio->file_offset;
+ btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
orig_bbio->file_offset += map_length;
if (bbio_has_ordered_extent(bbio)) {
refcount_inc(&orig_bbio->ordered->refs);
bbio->ordered = orig_bbio->ordered;
+ bbio->orig_logical = orig_bbio->orig_logical;
+ orig_bbio->orig_logical += map_length;
}
+ bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
-/* Free a bio that was never submitted to the underlying device. */
-static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
- if (bbio_has_ordered_extent(bbio))
- btrfs_put_ordered_extent(bbio->ordered);
- bio_put(&bbio->bio);
-}
+ /* Make sure we're already in task context. */
+ ASSERT(in_task());
-static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio)) {
- struct btrfs_ordered_extent *ordered = bbio->ordered;
+ if (bbio->async_csum)
+ wait_for_completion(&bbio->csum_done);
- bbio->end_io(bbio);
- btrfs_put_ordered_extent(ordered);
- } else {
- bbio->end_io(bbio);
- }
-}
-
-void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- btrfs_cleanup_bio(bbio);
+ /* Free bio that was never submitted to the underlying device. */
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+
bbio = orig_bbio;
}
@@ -135,18 +133,26 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
/* Load split bio's error which might be set above. */
if (status == BLK_STS_OK)
bbio->bio.bi_status = READ_ONCE(bbio->status);
- __btrfs_bio_end_io(bbio);
+
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
}
}
-static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == fbio->num_copies)
return cur_mirror + 1 - fbio->num_copies;
return cur_mirror + 1;
}
-static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == 1)
return fbio->num_copies;
@@ -167,17 +173,30 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct btrfs_failed_bio *fbio = repair_bbio->private;
struct btrfs_inode *inode = repair_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
- int mirror = repair_bbio->mirror_num;
-
/*
- * We can only trigger this for data bio, which doesn't support larger
- * folios yet.
+ * We can not move forward the saved_iter, as it will be later
+ * utilized by repair_bbio again.
*/
- ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+ struct bvec_iter saved_iter = repair_bbio->saved_iter;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
+ const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
+ int mirror = repair_bbio->mirror_num;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
+ unsigned int slot = 0;
+
+	/* Repair bbio should be exactly one block sized. */
+ ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);
+
+ btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
+ ASSERT(slot < nr_steps);
+ paddrs[slot] = paddr;
+ slot++;
+ }
if (repair_bbio->bio.bi_status ||
- !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
@@ -196,8 +215,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
mirror = prev_repair_mirror(fbio, mirror);
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
- repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- page_folio(bv->bv_page), bv->bv_offset, mirror);
+ logical, paddrs, step, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -214,13 +232,20 @@ done:
*/
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
u32 bio_offset,
- struct bio_vec *bv,
+ phys_addr_t paddrs[],
struct btrfs_failed_bio *fbio)
{
struct btrfs_inode *inode = failed_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 nr_steps = sectorsize / step;
+ /*
+ * For bs > ps cases, the saved_iter can be partially moved forward.
+ * In that case we should round it down to the block boundary.
+ */
+ const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
+ sectorsize);
struct btrfs_bio *repair_bbio;
struct bio *repair_bio;
int num_copies;
@@ -245,15 +270,22 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
atomic_inc(&fbio->repair_count);
- repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
+ repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
- repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ for (int i = 0; i < nr_steps; i++) {
+ int ret;
+
+ ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);
+
+ ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
+ offset_in_page(paddrs[i]));
+ ASSERT(ret == step);
+ }
repair_bbio = btrfs_bio(repair_bio);
- btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
- repair_bbio->inode = failed_bbio->inode;
- repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
+ btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
+ NULL, fbio);
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
@@ -265,10 +297,14 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u32 sectorsize = fs_info->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 step = min(sectorsize, PAGE_SIZE);
+ const u32 nr_steps = sectorsize / step;
struct bvec_iter *iter = &bbio->saved_iter;
blk_status_t status = bbio->bio.bi_status;
struct btrfs_failed_bio *fbio = NULL;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
u32 offset = 0;
/* Read-repair requires the inode field to be set by the submitter. */
@@ -286,19 +322,19 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
/* Clear the I/O error. A failed repair will reset it. */
bbio->bio.bi_status = BLK_STS_OK;
- while (iter->bi_size) {
- struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
+ btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
- bv.bv_len = min(bv.bv_len, sectorsize);
- if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
- fbio = repair_one_sector(bbio, offset, &bv, fbio);
-
- bio_advance_iter_single(&bbio->bio, iter, sectorsize);
- offset += sectorsize;
+ if (IS_ALIGNED(offset, sectorsize)) {
+ if (status ||
+ !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
+ fbio = repair_one_sector(bbio, offset - sectorsize,
+ paddrs, fbio);
+ }
}
-
if (bbio->csum != bbio->csum_inline)
- kfree(bbio->csum);
+ kvfree(bbio->csum);
if (fbio)
btrfs_repair_done(fbio);
@@ -306,7 +342,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
-static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
return;
@@ -321,44 +357,43 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
+ const struct bio *bio)
{
if (bio->bi_opf & REQ_META)
return fs_info->endio_meta_workers;
return fs_info->endio_workers;
}
-static void btrfs_end_bio_work(struct work_struct *work)
+static void simple_end_io_work(struct work_struct *work)
{
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
- /* Metadata reads are checked and repaired by the submitter. */
- if (is_data_bbio(bbio))
- btrfs_check_read_bio(bbio, bbio->bio.bi_private);
- else
- btrfs_bio_end_io(bbio, bbio->bio.bi_status);
+ if (bio_op(bio) == REQ_OP_READ) {
+ /* Metadata reads are checked and repaired by the submitter. */
+ if (is_data_bbio(bbio))
+ return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+ return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
+ }
+ if (bio_is_zone_append(bio) && !bio->bi_status)
+ btrfs_record_physical_zoned(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
{
struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_device *dev = bio->bi_private;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
btrfs_log_dev_io_error(bio, dev);
- if (bio_op(bio) == REQ_OP_READ) {
- INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
- queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
- } else {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
- btrfs_record_physical_zoned(bbio);
- btrfs_bio_end_io(bbio, bbio->bio.bi_status);
- }
+ INIT_WORK(&bbio->end_io_work, simple_end_io_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}
static void btrfs_raid56_end_io(struct bio *bio)
@@ -366,6 +401,9 @@ static void btrfs_raid56_end_io(struct bio *bio)
struct btrfs_io_context *bioc = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
+	/* RAID56 endio is always handled in a workqueue. */
+ ASSERT(in_task());
+
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
@@ -376,11 +414,12 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_put_bioc(bioc);
}
-static void btrfs_orig_write_end_io(struct bio *bio)
+static void orig_write_end_io_work(struct work_struct *work)
{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
struct btrfs_io_stripe *stripe = bio->bi_private;
struct btrfs_io_context *bioc = stripe->bioc;
- struct btrfs_bio *bbio = btrfs_bio(bio);
btrfs_bio_counter_dec(bioc->fs_info);
@@ -398,21 +437,31 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
-static void btrfs_clone_write_end_io(struct bio *bio)
+static void btrfs_orig_write_end_io(struct bio *bio)
{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
+ queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
+}
+
+static void clone_write_end_io_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
struct btrfs_io_stripe *stripe = bio->bi_private;
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
- } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ } else if (bio_is_zone_append(bio)) {
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
@@ -421,6 +470,14 @@ static void btrfs_clone_write_end_io(struct bio *bio)
bio_put(bio);
}
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
+ queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
+}
+
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
if (!dev || !dev->bdev ||
@@ -444,12 +501,20 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
ASSERT(btrfs_dev_is_sequential(dev, physical));
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
- btrfs_debug_in_rcu(dev->fs_info,
+ btrfs_debug(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
+ /*
+ * Track reads if tracking is enabled; ignore I/O operations before the
+ * filesystem is fully initialized.
+ */
+ if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
+ percpu_counter_add(&dev->fs_info->stats_read_blocks,
+ bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
+
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
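
A minimal sketch of the read accounting added above (the values are assumptions for illustration): the bio size in bytes is converted to a block count by shifting with sectorsize_bits before it is added to the per-cpu counter.

#include <stdio.h>

int main(void)
{
	unsigned int sectorsize_bits = 12;	/* log2(4096), an assumed block size */
	unsigned int bi_size = 128 * 1024;	/* a 128K read bio */

	printf("blocks accounted: %u\n", bi_size >> sectorsize_bits);	/* 32 */
	return 0;
}
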
@@ -459,6 +524,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
struct bio *orig_bio = bioc->orig_bio, *bio;
+ struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);
ASSERT(bio_op(orig_bio) != REQ_OP_READ);
@@ -467,8 +533,11 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio = orig_bio;
bio->bi_end_io = btrfs_orig_write_end_io;
} else {
- bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ /* We need to use endio_work to run end_io in task context. */
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
bio_inc_remaining(orig_bio);
+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
+ orig_bbio->file_offset, NULL, NULL);
bio->bi_end_io = btrfs_clone_write_end_io;
}
@@ -509,11 +578,15 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
}
}
-static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
if (bbio->bio.bi_opf & REQ_META)
return btree_csum_one_bio(bbio);
- return btrfs_csum_one_bio(bbio);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ return btrfs_csum_one_bio(bbio, true);
+#else
+ return btrfs_csum_one_bio(bbio, false);
+#endif
}
/*
@@ -540,11 +613,11 @@ static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
- blk_status_t ret;
+ int ret;
ret = btrfs_bio_csum(async->bbio);
if (ret)
- async->bbio->bio.bi_status = ret;
+ async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
/*
@@ -570,7 +643,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
+ btrfs_bio_end_io(async->bbio, bio->bi_status);
return;
}
@@ -585,20 +658,25 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
static bool should_async_write(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
bool auto_csum_mode = true;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
- struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
- if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
- return false;
-
- auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+ if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON)
+ return true;
+ /*
+	 * Write bios calculate the checksum and submit the bio in the same context.
+	 * Unless explicitly required, don't offload the serial checksum calculation
+	 * and bio submission to a workqueue.
+ */
+ return false;
#endif
/* Submit synchronously if the checksum implementation is fast. */
- if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
return false;
/*
@@ -609,7 +687,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
return false;
/* Zoned devices require I/O to be submitted in order. */
- if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+ if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
return false;
return true;
@@ -624,7 +702,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -643,21 +721,28 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
unsigned int nr_segs;
int sector_offset;
- map_length = min(map_length, bbio->fs_info->max_zone_append_size);
- sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ map_length = min(map_length, fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
&nr_segs, map_length);
- if (sector_offset)
- return sector_offset << SECTOR_SHIFT;
+ if (sector_offset) {
+ /*
+ * bio_split_rw_at() could split at a size smaller than our
+ * sectorsize and thus cause unaligned I/Os. Fix that by
+		 * always rounding down to the nearest block boundary.
+ */
+ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
+ }
return map_length;
}
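
A small standalone sketch of the rounding done in btrfs_append_map_length() (the numbers are made up): a split point of 9 sectors (4608 bytes) is not aligned to an assumed 4K block size, so it is rounded down to 4096 bytes.

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned int sectorsize = 4096;				/* assumed fs block size */
	unsigned int sector_offset = 9;				/* 512-byte sectors from the split */
	unsigned int bytes = sector_offset << SECTOR_SHIFT;	/* 4608, not block aligned */
	unsigned int aligned = bytes & ~(sectorsize - 1);	/* rounded down to 4096 */

	printf("raw=%u aligned=%u\n", bytes, aligned);
	return 0;
}
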
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
@@ -665,28 +750,45 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bool use_append = btrfs_use_zone_append(bbio);
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
- blk_status_t ret;
- int error;
+ blk_status_t status;
+ int ret;
- if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
+ if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
smap.rst_search_commit_root = true;
else
smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
- error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num);
- if (error) {
- ret = errno_to_blk_status(error);
- goto fail;
+ ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
}
+ /*
+ * For fscrypt writes we will get the encrypted bio after we've remapped
+ * our bio to the physical disk location, so we need to save the
+ * original bytenr so we know what we're checksumming.
+ */
+ if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
+ bbio->orig_logical = logical;
+
map_length = min(map_length, length);
if (use_append)
map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length);
+ struct btrfs_bio *split;
+
+ split = btrfs_split_bio(fs_info, bbio, map_length);
+ if (IS_ERR(split)) {
+ status = errno_to_blk_status(PTR_ERR(split));
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
+ }
+ bbio = split;
bio = &bbio->bio;
}
@@ -697,7 +799,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
@@ -707,8 +810,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
- if (is_data_bbio(bbio) && bioc &&
- btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
/*
* No locking for the list update, as we only add to
* the list in the I/O submission path, and list
@@ -723,7 +825,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
*/
- if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
+ if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
@@ -731,13 +833,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
goto done;
ret = btrfs_bio_csum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
} else if (use_append ||
(btrfs_is_zoned(fs_info) && inode &&
inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
}
@@ -758,18 +862,48 @@ fail:
ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
ASSERT(remaining);
- btrfs_bio_end_io(remaining, ret);
+ btrfs_bio_end_io(remaining, status);
}
- btrfs_bio_end_io(bbio, ret);
+end_bbio:
+ btrfs_bio_end_io(bbio, status);
/* Do not submit another chunk */
return true;
}
+static void assert_bbio_alignment(struct btrfs_bio *bbio)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 alignment = min(blocksize, PAGE_SIZE);
+ const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ const u32 length = bbio->bio.bi_iter.bi_size;
+
+ /* The logical and length should still be aligned to blocksize. */
+ ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
+ length != 0, "root=%llu inode=%llu logical=%llu length=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode), logical, length);
+
+ bio_for_each_bvec(bvec, &bbio->bio, iter)
+ ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
+ IS_ALIGNED(bvec.bv_len, alignment),
+ "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
+ bvec.bv_offset, bvec.bv_len);
+#endif
+}
+
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
+ assert_bbio_alignment(bbio);
+
while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -783,19 +917,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
*
* The I/O is issued synchronously to block the repair read completion from
* freeing the bio.
+ *
+ * @ino: Offending inode number
+ * @fileoff: File offset inside the inode
+ * @length: Length of the repair write
+ * @logical: Logical address of the range
+ * @paddrs:	Array of physical addresses holding the block content
+ * @step:	Length of each entry in @paddrs
+ * @mirror_num: Mirror number to write to. Must not be zero
*/
-int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num)
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
+ u32 length, u64 logical, const phys_addr_t paddrs[],
+ unsigned int step, int mirror_num)
{
+ const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
struct btrfs_io_stripe smap = { 0 };
- struct bio_vec bvec;
- struct bio bio;
+ struct bio *bio = NULL;
int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
+ /* Basic alignment checks. */
+ ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
+	/* It must be either a single data block or a single metadata block. */
+ ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
+ ASSERT(step <= length);
+ ASSERT(is_power_of_2(step));
+
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
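
A standalone sketch of the vec-count arithmetic used for nr_steps above (assuming DIV_ROUND_UP_POW2() behaves as a round-up division for a power-of-two divisor; the exact kernel macro may differ): a 16K block repaired with 4K pages needs four bio vecs, while a block equal to the step needs one.

#include <stdio.h>

static unsigned int div_round_up_pow2(unsigned int len, unsigned int step)
{
	return (len + step - 1) / step;		/* step is assumed to be a power of two */
}

int main(void)
{
	printf("%u %u\n", div_round_up_pow2(16384, 4096), div_round_up_pow2(4096, 4096));
	return 0;
}
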
@@ -809,31 +960,33 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (ret < 0)
goto out_counter_dec;
- if (!smap.dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
+ if (unlikely(!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
ret = -EIO;
goto out_counter_dec;
}
- bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- ret = bio_add_folio(&bio, folio, length, folio_offset);
- ASSERT(ret);
- ret = submit_bio_wait(&bio);
+ bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ for (int i = 0; i < nr_steps; i++) {
+ ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
+ /* We should have allocated enough slots to contain all the different pages. */
+ ASSERT(ret == step);
+ }
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
if (ret) {
/* try to remap that extent elsewhere? */
btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
- goto out_bio_uninit;
+ goto out_counter_dec;
}
- btrfs_info_rl_in_rcu(fs_info,
+ btrfs_info_rl(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start, btrfs_dev_name(smap.dev),
+ ino, fileoff, btrfs_dev_name(smap.dev),
smap.physical >> SECTOR_SHIFT);
ret = 0;
-out_bio_uninit:
- bio_uninit(&bio);
out_counter_dec:
btrfs_bio_counter_dec(fs_info);
return ret;
@@ -846,16 +999,16 @@ out_counter_dec:
*/
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bbio->bio.bi_iter.bi_size;
struct btrfs_io_stripe smap = { 0 };
int ret;
- ASSERT(fs_info);
ASSERT(mirror_num > 0);
ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
- ASSERT(!bbio->inode);
+ ASSERT(!is_data_inode(bbio->inode));
+ ASSERT(bbio->is_scrub);
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
@@ -882,22 +1035,18 @@ int __init btrfs_bioset_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio), 0))
- goto out_free_bioset;
+ goto out;
if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
- goto out_free_clone_bioset;
+ goto out;
if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
sizeof(struct btrfs_failed_bio)))
- goto out_free_repair_bioset;
+ goto out;
return 0;
-out_free_repair_bioset:
- bioset_exit(&btrfs_repair_bioset);
-out_free_clone_bioset:
- bioset_exit(&btrfs_clone_bioset);
-out_free_bioset:
- bioset_exit(&btrfs_bioset);
+out:
+ btrfs_bioset_exit();
return -ENOMEM;
}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e2fe16074ad6..1be74209f0b8 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -18,13 +18,6 @@ struct btrfs_inode;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
-/*
- * Maximum number of sectors for a single bio to limit the size of the
- * checksum array. This matches the number of bio_vecs per bio and thus the
- * I/O size for buffered I/O.
- */
-#define BTRFS_MAX_BIO_SECTORS (256)
-
typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
@@ -34,7 +27,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
struct btrfs_bio {
/*
* Inode and offset into it that this I/O operates on.
- * Only set for data I/O.
+ *
+	 * If the inode is a data inode, csum verification and read-repair
+	 * are done automatically.
+	 * If the inode is a metadata inode, everything is handled by the caller.
*/
struct btrfs_inode *inode;
u64 file_offset;
@@ -56,11 +52,16 @@ struct btrfs_bio {
* - pointer to the checksums for this bio
* - original physical address from the allocator
* (for zone append only)
+ * - original logical address, used for checksumming fscrypt bios
*/
struct {
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_sum *sums;
+ struct work_struct csum_work;
+ struct completion csum_done;
+ struct bvec_iter csum_saved_iter;
u64 orig_physical;
+ u64 orig_logical;
};
/* For metadata reads: parentness verification. */
@@ -76,12 +77,21 @@ struct btrfs_bio {
atomic_t pending_ios;
struct work_struct end_io_work;
- /* File system that this I/O operates on. */
- struct btrfs_fs_info *fs_info;
-
/* Save the first error status of split bio. */
blk_status_t status;
+ /* Use the commit root to look up csums (data read bio only). */
+ bool csum_search_commit_root;
+
+ /*
+	 * Since scrub reuses the btree inode, we need this flag to distinguish
+ * scrub bios.
+ */
+ bool is_scrub;
+
+	/* Whether the csum generation for a data write is async. */
+ bool async_csum;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -97,10 +107,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
int __init btrfs_bioset_init(void);
void __cold btrfs_bioset_exit(void);
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private);
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private);
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
@@ -109,8 +119,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
-int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num);
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
+ u32 length, u64 logical, const phys_addr_t paddrs[],
+ unsigned int step, int mirror_num);
#endif
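
A minimal stub-type sketch (not the kernel structures) of the layout change above: btrfs_bio no longer carries an fs_info pointer, so callers reach it through bbio->inode->root->fs_info instead.

#include <stdio.h>

struct btrfs_fs_info { unsigned int sectorsize; };
struct btrfs_root { struct btrfs_fs_info *fs_info; };
struct btrfs_inode { struct btrfs_root *root; };
struct bbio_stub { struct btrfs_inode *inode; unsigned long long file_offset; };

int main(void)
{
	struct btrfs_fs_info fs_info = { .sectorsize = 4096 };
	struct btrfs_root root = { .fs_info = &fs_info };
	struct btrfs_inode inode = { .root = &root };
	struct bbio_stub bbio = { .inode = &inode, .file_offset = 0 };

	printf("sectorsize=%u\n", bbio.inode->root->fs_info->sectorsize);
	return 0;
}
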
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 4427c1b835e8..08b14449fabe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
@@ -173,43 +186,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
}
}
+static int btrfs_bg_start_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_block_group *new_bg =
+ rb_entry(new, struct btrfs_block_group, cache_node);
+ const struct btrfs_block_group *exist_bg =
+ rb_entry(exist, struct btrfs_block_group, cache_node);
+
+ if (new_bg->start < exist_bg->start)
+ return -1;
+ if (new_bg->start > exist_bg->start)
+ return 1;
+ return 0;
+}
+
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group *block_group)
+static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_block_group *cache;
- bool leftmost = true;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct rb_node *exist;
+ int ret = 0;
ASSERT(block_group->length != 0);
- write_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_root.rb_node;
-
- while (*p) {
- parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group, cache_node);
- if (block_group->start < cache->start) {
- p = &(*p)->rb_left;
- } else if (block_group->start > cache->start) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- write_unlock(&info->block_group_cache_lock);
- return -EEXIST;
- }
- }
-
- rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color_cached(&block_group->cache_node,
- &info->block_group_cache_tree, leftmost);
+ write_lock(&fs_info->block_group_cache_lock);
- write_unlock(&info->block_group_cache_lock);
+ exist = rb_find_add_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
+ if (exist)
+ ret = -EEXIST;
+ write_unlock(&fs_info->block_group_cache_lock);
- return 0;
+ return ret;
}
/*
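
A standalone sketch of the comparator contract relied on by rb_find_add_cached() above (stub type, not the kernel struct): a negative result walks left, a positive result walks right, and zero signals an existing entry with the same start, which the caller maps to -EEXIST.

#include <stdio.h>

struct bg_stub { unsigned long long start; };

static int bg_start_cmp(const struct bg_stub *new_bg, const struct bg_stub *exist_bg)
{
	if (new_bg->start < exist_bg->start)
		return -1;
	if (new_bg->start > exist_bg->start)
		return 1;
	return 0;
}

int main(void)
{
	struct bg_stub a = { .start = 1048576 }, b = { .start = 2097152 };

	printf("%d %d %d\n", bg_start_cmp(&a, &b), bg_start_cmp(&b, &a), bg_start_cmp(&a, &a));
	return 0;
}
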
@@ -527,10 +538,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
*total_added_ret = 0;
while (start < end) {
- if (!find_first_extent_bit(&info->excluded_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL))
+ if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
break;
if (extent_start <= start) {
@@ -586,7 +596,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
struct btrfs_root *extent_root;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret = 0;
@@ -603,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
BTRFS_SUPER_INFO_OFFSET));
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
search_offset = index * div_u64(block_group->length, max_index);
@@ -628,7 +638,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
- btrfs_free_path(path);
return ret;
}
@@ -704,7 +713,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
@@ -735,13 +744,13 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
* root to add free space. So we skip locking and search the commit
* root, since its read-only
*/
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -787,8 +796,8 @@ next:
if (key.objectid < last) {
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
btrfs_release_path(path);
goto next;
}
@@ -831,14 +840,13 @@ next:
block_group->start + block_group->length,
NULL);
out:
- btrfs_free_path(path);
return ret;
}
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
- clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
- bg->start + bg->length - 1, EXTENT_UPTODATE);
+ btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
}
static noinline void caching_thread(struct btrfs_work *work)
@@ -882,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
- ret = load_free_space_tree(caching_ctl);
+ ret = btrfs_load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
done:
@@ -1057,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
struct inode *inode;
@@ -1223,7 +1231,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
- btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
+ btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
-block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
@@ -1240,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
- ret = remove_block_group_free_space(trans, block_group);
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -1249,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1288,7 +1305,6 @@ out:
btrfs_put_block_group(block_group);
if (remove_rsv)
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
- btrfs_free_path(path);
return ret;
}
@@ -1341,7 +1357,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1386,8 +1402,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
* leeway to allow us to mark this block group as read only.
*/
- if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
- BTRFS_RESERVE_NO_FLUSH))
+ if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
@@ -1396,8 +1411,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
- btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
- -cache->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
@@ -1409,7 +1423,7 @@ out:
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
- btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ btrfs_dump_space_info(cache->space_info, 0, false);
}
return ret;
}
@@ -1424,9 +1438,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
int ret;
spin_lock(&fs_info->trans_lock);
- if (trans->transaction->list.prev != &fs_info->trans_list) {
- prev_trans = list_last_entry(&trans->transaction->list,
- struct btrfs_transaction, list);
+ if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
+ prev_trans = list_prev_entry(trans->transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
@@ -1443,14 +1456,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
- ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
if (ret)
goto out;
}
- ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
@@ -1460,6 +1473,32 @@ out:
}
/*
+ * Link the block_group to a list via bg_list.
+ *
+ * @bg: The block_group to link to the list.
+ * @list: The list to link it to.
+ *
+ * Use this rather than list_add_tail() directly so that the locking and
+ * refcounting rules for bg_list are respected.
+ *
+ * Returns: true if the bg was linked with a refcount bump and false otherwise.
+ */
+static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ bool added = false;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, list);
+ added = true;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return added;
+}
+
+/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
@@ -1567,15 +1606,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
* fs_info->unused_bgs list.
*/
- btrfs_get_block_group(block_group);
- list_add_tail(&block_group->bg_list, &retry_list);
+ btrfs_link_bg_list(block_group, &retry_list);
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1598,8 +1637,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
btrfs_dec_block_group_ro(block_group);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
+ }
goto next;
}
@@ -1645,8 +1686,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- btrfs_space_info_update_bytes_pinned(fs_info, space_info,
- -block_group->pinned);
+ btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
block_group->pinned = 0;
@@ -1753,7 +1793,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -1801,12 +1848,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_should_reclaim(fs_info))
return;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- sb_end_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1814,7 +1859,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return;
}
@@ -1826,8 +1870,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
- u64 zone_unusable;
- u64 reclaimed;
+ u64 used;
+ u64 reserved;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1891,13 +1935,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
/*
* Get out fast, in case we're read-only or unmounting the
* filesystem. It is OK to drop block groups from the list even
- * for the read-only case. As we did sb_start_write(),
+ * for the read-only case. As we did take the super write lock,
* "mount -o remount,ro" won't happen and read-only filesystem
* means it is forced read-only due to a fatal error. So, it
* never gets back to read-write to let us reclaim again.
@@ -1907,31 +1952,41 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
- btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% unusable",
- bg->start,
- div64_u64(bg->used * 100, bg->length),
- div64_u64(zone_unusable * 100, bg->length));
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * happens when running delayed references. When we relocate the
+ * chunk below, relocation first flushes delalloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
trace_btrfs_reclaim_block_group(bg);
- reclaimed = bg->used;
- ret = btrfs_relocate_chunk(fs_info, bg->start);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
- reclaimed = 0;
+ used = 0;
+ reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -1940,24 +1995,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&space_info->lock);
space_info->reclaim_count++;
- space_info->reclaim_bytes += reclaimed;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
spin_unlock(&space_info->lock);
next:
- if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
- /* Refcount held by the reclaim_bgs list after splice. */
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * This block group might be added to the unused list
- * during the above process. Move it back to the
- * reclaim list otherwise.
- */
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
- }
+ if (ret && !READ_ONCE(space_info->periodic_reclaim))
+ btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1981,7 +2025,6 @@ end:
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -1989,7 +2032,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
- queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -1997,13 +2040,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
+ if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
trace_btrfs_add_reclaim_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
@@ -2027,7 +2065,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
return -ENOENT;
}
- if (map->start != key->objectid || map->chunk_len != key->offset) {
+ if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, map->start, map->chunk_len);
@@ -2040,7 +2078,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
@@ -2186,9 +2224,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
- cache->start + stripe_len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
+ cache->start + stripe_len - 1,
+ EXTENT_DIRTY, NULL);
if (ret)
return ret;
}
@@ -2201,7 +2239,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
/* Shouldn't have super stripes in sequential zones */
- if (zoned && nr) {
+ if (unlikely(zoned && nr)) {
kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
@@ -2214,9 +2252,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
- logical[nr] + len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
+ logical[nr], logical[nr] + len - 1,
+ EXTENT_DIRTY, NULL);
if (ret) {
kfree(logical);
return ret;
@@ -2292,7 +2330,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
break;
bg = btrfs_lookup_block_group(fs_info, map->start);
- if (!bg) {
+ if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
map->start, map->chunk_len);
@@ -2300,9 +2338,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
btrfs_free_chunk_map(map);
break;
}
- if (bg->start != map->start || bg->length != map->chunk_len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
map->start, map->chunk_len,
@@ -2341,8 +2379,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->space_info = btrfs_find_space_info(info, cache->flags);
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2414,11 +2453,12 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
- ret = btrfs_add_block_group_cache(info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
}
+
trace_btrfs_add_block_group(info, cache, 0);
btrfs_add_bg_to_space_info(info, cache);
@@ -2463,7 +2503,8 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = map->chunk_len;
bg->flags = map->type;
- ret = btrfs_add_block_group_cache(fs_info, bg);
+ bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
+ ret = btrfs_add_block_group_cache(bg);
/*
* We may have some valid block group cache added already, in
* that case we skip to the next one.
@@ -2513,8 +2554,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
return fill_dummy_bgs(info);
key.objectid = 0;
- key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2645,7 +2686,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2662,7 +2703,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
key.offset = start;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
@@ -2670,11 +2711,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2762,7 +2800,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
+ btrfs_add_block_group_free_space(trans, block_group);
/*
* If we restriped during balance, we may have added a new raid
@@ -2776,8 +2814,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
/*
* If the block group is still unused, add it to the list of
@@ -2791,7 +2833,7 @@ next:
* space or none at all (due to no need to COW, extent buffers
* were already COWed in the current transaction and still
* unwritten, tree heights lower than the maximum possible
- * height, etc). For data we generally reserve the axact amount
+ * height, etc). For data we generally reserve the exact amount
* of space we are going to allocate later, the exception is
* when using compression, as we must reserve space based on the
* uncompressed data size, because the compression is only done
@@ -2835,8 +2877,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off
}
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size)
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2856,7 +2898,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
cache->length = size;
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
cache->flags = type;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2890,10 +2932,10 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* assigned to our block group. We want our bg to be added to the rbtree
* with its ->space_info set.
*/
- cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ cache->space_info = space_info;
ASSERT(cache->space_info);
- ret = btrfs_add_block_group_cache(fs_info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
@@ -2915,7 +2957,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
}
#endif
- list_add_tail(&cache->bg_list, &trans->new_bgs);
+ btrfs_link_bg_list(cache, &trans->new_bgs);
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
@@ -2935,6 +2977,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
@@ -2987,7 +3030,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
*/
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags,
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
@@ -3015,15 +3058,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
goto unlock_out;
- alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
/*
* We have allocated a new chunk. We also need to activate that chunk to
* grant metadata tickets for zoned filesystem.
*/
- ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ ret = btrfs_zoned_activate_one_bg(space_info, true);
if (ret < 0)
goto out;
@@ -3060,8 +3103,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
(cache->alloc_offset - cache->used - cache->pinned -
cache->reserved) +
(cache->length - cache->zone_capacity);
- btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
- cache->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
@@ -3123,7 +3165,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(trans, leaf);
fail:
btrfs_release_path(path);
/*
@@ -3201,7 +3242,7 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
@@ -3313,7 +3354,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
@@ -3330,7 +3371,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
}
- btrfs_free_path(path);
return 0;
}
@@ -3353,7 +3393,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int loops = 0;
@@ -3508,7 +3548,6 @@ out:
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
- btrfs_free_path(path);
return ret;
}
@@ -3519,7 +3558,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct list_head *io = &cur_trans->io_bgs;
path = btrfs_alloc_path();
@@ -3606,9 +3645,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
- }
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
}
/* If its not on the io list, we need to put the block group */
@@ -3631,7 +3672,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
btrfs_put_block_group(cache);
}
- btrfs_free_path(path);
return ret;
}
@@ -3699,7 +3739,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val -= num_bytes;
cache->used = old_val;
cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
space_info->bytes_used -= num_bytes;
space_info->disk_used -= num_bytes * factor;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -3710,8 +3750,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -3757,7 +3797,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc,
+ u64 ram_bytes, u64 num_bytes, bool delalloc,
bool force_wrong_size_class)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -3768,31 +3808,38 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
spin_lock(&cache->lock);
if (cache->ro) {
ret = -EAGAIN;
- goto out;
+ goto out_error;
}
if (btrfs_block_group_should_use_size_class(cache)) {
size_class = btrfs_calc_block_group_size_class(num_bytes);
ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
if (ret)
- goto out;
+ goto out_error;
}
+
cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- trace_btrfs_space_reservation(cache->fs_info, "space_info",
- space_info->flags, num_bytes, 1);
- btrfs_space_info_update_bytes_may_use(cache->fs_info,
- space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info, "space_info",
+ space_info->flags, num_bytes, 1);
+ spin_unlock(&cache->lock);
+
+ space_info->bytes_reserved += num_bytes;
+ btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
+
/*
* Compression can use less space than we reserved, so wake tickets if
* that happens.
*/
if (num_bytes < ram_bytes)
- btrfs_try_granting_tickets(cache->fs_info, space_info);
-out:
+ btrfs_try_granting_tickets(space_info);
+ spin_unlock(&space_info->lock);
+
+ return 0;
+
+out_error:
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
@@ -3801,35 +3848,38 @@ out:
/*
* Update the block_group and space info counters.
*
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
+ * @cache: The cache we are manipulating.
+ * @num_bytes: The number of bytes in question.
+ * @is_delalloc: Whether the blocks are allocated for a delalloc write.
*
* This is called by somebody who is freeing space that was never actually used
* on disk. For example if you reserve some space for a new leaf in transaction
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc)
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
+ bool bg_ro;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
- if (cache->ro)
+ bg_ro = cache->ro;
+ cache->reserved -= num_bytes;
+ if (is_delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+
+ if (bg_ro)
space_info->bytes_readonly += num_bytes;
else if (btrfs_is_zoned(cache->fs_info))
space_info->bytes_zone_unusable += num_bytes;
- cache->reserved -= num_bytes;
+
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
- if (delalloc)
- cache->delalloc_bytes -= num_bytes;
- spin_unlock(&cache->lock);
-
- btrfs_try_granting_tickets(cache->fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
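
The reserve/free helpers above now take a bool for the delalloc flag and update the space_info counters only after dropping the block group's spinlock. A hypothetical caller sketch, not part of this patch (the function name is invented), showing the new prototype:

/*
 * Illustrative only: release a reservation that was taken for a regular
 * (non-delalloc) allocation and ended up unused.
 */
static void release_unused_reservation(struct btrfs_block_group *bg, u64 len)
{
	/* The delalloc argument is now a bool rather than an int. */
	btrfs_free_reserved_bytes(bg, len, false);
}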
@@ -3844,14 +3894,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
-static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *sinfo, int force)
+static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
- return 1;
+ return true;
/*
* in limited mode, we want to have some free space up to
@@ -3862,22 +3912,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
- return 1;
+ return true;
}
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
- return 0;
- return 1;
+ return false;
+ return true;
}
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(trans->fs_info, type);
+ if (!space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
- return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info,
+ u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3890,7 +3949,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
*/
check_system_chunk(trans, flags);
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
@@ -3938,8 +3997,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
+
+ sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
+ if (unlikely(!sys_space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3947,17 +4014,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4070,6 +4137,8 @@ out:
*
* This function, btrfs_chunk_alloc(), belongs to phase 1.
*
+ * @space_info: specify which space_info the new chunk should belong to.
+ *
* If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
@@ -4078,11 +4147,11 @@ out:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
*/
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *space_info;
struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
@@ -4121,9 +4190,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
-
do {
spin_lock(&space_info->lock);
if (force < space_info->force_alloc)
@@ -4131,11 +4197,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
should_alloc = should_alloc_chunk(fs_info, space_info, force);
if (space_info->full) {
/* No more free physical space */
+ spin_unlock(&space_info->lock);
if (should_alloc)
ret = -ENOSPC;
else
ret = 0;
- spin_unlock(&space_info->lock);
return ret;
} else if (!should_alloc) {
spin_unlock(&space_info->lock);
@@ -4147,16 +4213,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* recheck if we should continue with our allocation
* attempt.
*/
+ spin_unlock(&space_info->lock);
wait_for_alloc = true;
force = CHUNK_ALLOC_NO_FORCE;
- spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
/* Proceed with allocation */
- space_info->chunk_alloc = 1;
- wait_for_alloc = false;
+ space_info->chunk_alloc = true;
spin_unlock(&space_info->lock);
+ wait_for_alloc = false;
}
cond_resched();
@@ -4184,7 +4250,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret_bg = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, space_info, flags);
trans->allocating_chunk = false;
if (IS_ERR(ret_bg)) {
@@ -4203,7 +4269,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
- space_info->full = 1;
+ space_info->full = true;
else
goto out;
} else {
@@ -4213,7 +4279,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
- space_info->chunk_alloc = 0;
+ space_info->chunk_alloc = false;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
@@ -4254,12 +4320,16 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
left, bytes, type);
- btrfs_dump_space_info(fs_info, info, 0, 0);
+ btrfs_dump_space_info(info, 0, false);
}
if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
/*
* Ignore failure to create system chunk. We might end up not
@@ -4267,7 +4337,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
} else {
@@ -4275,7 +4345,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
* We have a new chunk. We also need to activate it for
* zoned filesystem.
*/
- ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ ret = btrfs_zoned_activate_one_bg(info, true);
if (ret < 0)
return;
@@ -4375,6 +4445,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
}
}
+static void check_removing_space_info(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *info = space_info->fs_info;
+
+ if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
+ /* This is a top space_info, proceed with its children first. */
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i]) {
+ check_removing_space_info(space_info->sub_group[i]);
+ kfree(space_info->sub_group[i]);
+ space_info->sub_group[i] = NULL;
+ }
+ }
+ }
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually important and
+ * indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(space_info, 0, false);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due to an
+ * IO failure on a writeback attempt of one or more of its extent
+ * buffers, we could not do proper (and cheap) unaccounting of their
+ * reserved space, so don't warn on bytes_reserved > 0 in that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(space_info, 0, false);
+ }
+
+ WARN_ON(space_info->reclaim_size > 0);
+}
+
/*
* Must be called only after stopping all workers, since we could have block
* group caching kthreads running, and therefore they could race with us if we
@@ -4400,8 +4507,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
- caching_ctl = list_entry(info->caching_block_groups.next,
- struct btrfs_caching_control, list);
+ caching_ctl = list_first_entry(&info->caching_block_groups,
+ struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
@@ -4472,32 +4579,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
- space_info = list_entry(info->space_info.next,
- struct btrfs_space_info,
- list);
-
- /*
- * Do not hide this behind enospc_debug, this is actually
- * important and indicates a real bug if this happens.
- */
- if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_may_use > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
-
- /*
- * If there was a failure to cleanup a log tree, very likely due
- * to an IO failure on a writeback attempt of one or more of its
- * extent buffers, we could not do proper (and cheap) unaccounting
- * of their reserved space, so don't warn on bytes_reserved > 0 in
- * that case.
- */
- if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
- if (WARN_ON(space_info->bytes_reserved > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
- }
+ space_info = list_first_entry(&info->space_info,
+ struct btrfs_space_info, list);
- WARN_ON(space_info->reclaim_size > 0);
+ check_removing_space_info(space_info);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
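
btrfs_chunk_alloc() now requires callers to pass the target space_info explicitly instead of resolving it from the flags internally. A hypothetical caller sketch (mirroring the btrfs_force_chunk_alloc() change above; the function name is invented):

/* Illustrative only: allocate a data chunk with the new prototype. */
static int alloc_data_chunk_example(struct btrfs_trans_handle *trans)
{
	u64 flags = btrfs_get_alloc_profile(trans->fs_info, BTRFS_BLOCK_GROUP_DATA);
	struct btrfs_space_info *space_info;

	space_info = btrfs_find_space_info(trans->fs_info, flags);
	if (!space_info)
		return -EINVAL;

	return btrfs_chunk_alloc(trans, space_info, flags, CHUNK_ALLOC_NO_FORCE);
}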
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 36937eeab9b8..5f933455118c 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -63,7 +63,7 @@ enum btrfs_discard_state {
* CHUNK_ALLOC_FORCE means it must try to allocate one
*
* CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
- * find_free_extent() that also activaes the zone
+ * find_free_extent() that also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Set after we add a new block group to the free space tree. */
+ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/*
@@ -244,6 +246,11 @@ struct btrfs_block_group {
/* Lock for free space tree operations. */
struct mutex free_space_lock;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps_cached;
+
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
@@ -326,8 +333,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size);
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
@@ -338,11 +345,12 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc,
+ u64 ram_bytes, u64 num_bytes, bool delalloc,
bool force_wrong_size_class);
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc);
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index a07b9594dc70..96cf7a162987 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
spin_unlock(&dest->lock);
}
if (num_bytes)
- btrfs_space_info_free_bytes_may_use(fs_info,
- space_info,
- num_bytes);
+ btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
}
if (qgroup_to_release_ret)
*qgroup_to_release_ret = qgroup_to_release;
@@ -220,8 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -261,8 +258,7 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -383,15 +379,13 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
- num_bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
- -num_bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
block_rsv->reserved = block_rsv->size;
- btrfs_try_granting_tickets(fs_info, sinfo);
+ btrfs_try_granting_tickets(sinfo);
}
block_rsv->full = (block_rsv->reserved == block_rsv->size);
@@ -422,6 +416,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CHUNK_TREE_OBJECTID:
root->block_rsv = &fs_info->chunk_block_rsv;
break;
+ case BTRFS_TREE_LOG_OBJECTID:
+ root->block_rsv = &fs_info->treelog_rsv;
+ break;
default:
root->block_rsv = NULL;
break;
@@ -442,6 +439,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
+ /* The treelog_rsv uses a dedicated space_info in zoned mode. */
+ if (!btrfs_is_zoned(fs_info)) {
+ fs_info->treelog_rsv.space_info = space_info;
+ } else {
+ ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ fs_info->treelog_rsv.space_info = space_info->sub_group[0];
+ }
+
btrfs_update_global_block_rsv(fs_info);
}
@@ -523,8 +528,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- blocksize, BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -545,7 +550,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d12b1fac5c74..79ae9d05cd91 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -24,6 +24,7 @@ enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_CHUNK,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_TREELOG,
BTRFS_BLOCK_RSV_EMPTY,
BTRFS_BLOCK_RSV_TEMP,
};
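
The new BTRFS_BLOCK_RSV_TREELOG type backs the fs_info->treelog_rsv wired up above. A hedged sketch of how such a global reservation would typically be set up (assuming the existing btrfs_init_block_rsv() helper; the wrapper name is invented and the real initialization is elsewhere in the patch series):

/* Illustrative sketch only. */
static void init_treelog_rsv_example(struct btrfs_fs_info *fs_info)
{
	btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
}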
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index aa1f55cd81b7..73602ee8de3f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -18,20 +18,20 @@
#include <linux/lockdep.h>
#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
+#include "ctree.h"
#include "block-rsv.h"
#include "extent_map.h"
-#include "extent_io.h"
#include "extent-io-tree.h"
-#include "ordered-data.h"
-#include "delayed-inode.h"
-struct extent_state;
struct posix_acl;
struct iov_iter;
struct writeback_control;
struct btrfs_root;
struct btrfs_fs_info;
struct btrfs_trans_handle;
+struct btrfs_bio;
+struct btrfs_file_extent;
+struct btrfs_delayed_node;
/*
* Since we search a directory based on f_pos (struct dir_context::pos) we have
@@ -145,6 +145,7 @@ struct btrfs_inode {
* different from prop_compress and takes precedence if set.
*/
u8 defrag_compress;
+ s8 defrag_compress_level;
/*
* Lock for counters and all fields used to determine if the inode is in
@@ -247,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
@@ -337,6 +338,11 @@ struct btrfs_inode {
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
+
struct inode vfs_inode;
};
@@ -516,17 +522,38 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
-/* Array of bytes with variable length, hexadecimal format 0x1234 */
-#define CSUM_FMT "0x%*phN"
-#define CSUM_FMT_VALUE(size, bytes) size, bytes
+static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ mapping_clear_stable_writes(inode->vfs_inode.i_mapping);
+ else
+ mapping_set_stable_writes(inode->vfs_inode.i_mapping);
+}
+
+static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
+{
+ /* Metadata inode should not reach here. */
+ ASSERT(is_data_inode(inode));
+
+ /* We only allow BITS_PER_LONG blocks for each bitmap. */
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
+ inode->root->fs_info->block_min_order,
+ inode->root->fs_info->block_max_order);
+#endif
+}
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected);
+void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddr, u8 *dest);
+void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddrs[], u8 *dest);
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u32 bio_offset, const phys_addr_t paddrs[]);
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
- bool nowait, bool strict);
+ bool nowait);
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
@@ -536,10 +563,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index);
+ const struct fscrypt_str *name, bool add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front);
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -584,9 +610,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path);
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
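
btrfs_iget() and btrfs_iget_path() now return a struct btrfs_inode * instead of the raw VFS inode. A hypothetical caller sketch (the function name is invented) showing how a caller reaches the VFS inode after this change:

/* Illustrative only: look up an inode and return its embedded VFS inode. */
static struct inode *iget_vfs_example(u64 ino, struct btrfs_root *root)
{
	struct btrfs_inode *inode = btrfs_iget(ino, root);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	return &inode->vfs_inode;
}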
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0c4d486c3048..6b3357287b42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -67,9 +67,7 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
GFP_NOFS, &btrfs_compressed_bioset));
- btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
- bbio->inode = inode;
- bbio->file_offset = start;
+ btrfs_bio_init(bbio, inode, start, end_io, NULL);
return to_compressed_bio(bbio);
}
@@ -90,19 +88,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start,
+ struct btrfs_inode *inode, u64 start,
struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_folios(ws, mapping, start, folios,
+ return zlib_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_folios(ws, mapping, start, folios,
+ return lzo_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_folios(ws, mapping, start, folios,
+ return zstd_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
@@ -194,15 +192,13 @@ static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_c
static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
{
- struct list_head remove;
+ LIST_HEAD(remove);
struct list_head *tmp, *next;
int freed;
if (compr_pool.count == 0)
return SHRINK_STOP;
- INIT_LIST_HEAD(&remove);
-
/* For now, just simply drain the whole list. */
spin_lock(&compr_pool.lock);
list_splice_init(&compr_pool.list, &remove);
@@ -223,10 +219,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
/*
* Common wrappers for page allocation from compression wrappers
*/
-struct folio *btrfs_alloc_compr_folio(void)
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
{
struct folio *folio = NULL;
+ /* For bs > ps cases, no cached folio pool for now. */
+ if (fs_info->block_min_order)
+ goto alloc;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > 0) {
folio = list_first_entry(&compr_pool.list, struct folio, lru);
@@ -238,13 +238,18 @@ struct folio *btrfs_alloc_compr_folio(void)
if (folio)
return folio;
- return folio_alloc(GFP_NOFS, 0);
+alloc:
+ return folio_alloc(GFP_NOFS, fs_info->block_min_order);
}
void btrfs_free_compr_folio(struct folio *folio)
{
bool do_free = false;
+ /* The folio is from a bs > ps fs, no cached pool for now. */
+ if (folio_order(folio))
+ goto free;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > compr_pool.thresh) {
do_free = true;
@@ -257,6 +262,7 @@ void btrfs_free_compr_folio(struct folio *folio)
if (!do_free)
return;
+free:
ASSERT(folio_ref_count(folio) == 1);
folio_put(folio);
}
@@ -282,15 +288,15 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long index = cb->start >> PAGE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
+ pgoff_t index = cb->start >> PAGE_SHIFT;
+ const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (error)
- mapping_set_error(inode->i_mapping, error);
+ ret = blk_status_to_errno(cb->bbio.bio.bi_status);
+ if (ret)
+ mapping_set_error(inode->i_mapping, ret);
folio_batch_init(&fbatch);
while (index <= end_index) {
@@ -311,22 +317,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
/* the inode may be gone now */
}
-static void btrfs_finish_compressed_write_work(struct work_struct *work)
-{
- struct compressed_bio *cb =
- container_of(work, struct compressed_bio, write_end_work);
-
- btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
- cb->bbio.bio.bi_status == BLK_STS_OK);
-
- if (cb->writeback)
- end_compressed_writeback(cb);
- /* Note, our inode could be gone now */
-
- btrfs_free_compressed_folios(cb);
- bio_put(&cb->bbio.bio);
-}
-
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -337,25 +327,33 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
- queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
+ cb->bbio.bio.bi_status == BLK_STS_OK);
+
+ if (cb->writeback)
+ end_compressed_writeback(cb);
+ /* Note, our inode could be gone now. */
+ btrfs_free_compressed_folios(cb);
+ bio_put(&cb->bbio.bio);
}
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
+ unsigned int findex = 0;
while (offset < cb->compressed_len) {
+ struct folio *folio = cb->compressed_folios[findex];
+ u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio));
int ret;
- u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
/* Maximum compressed extent is smaller than bio size limit. */
- ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, folio, len, 0);
ASSERT(ret);
offset += len;
+ findex++;
}
}
@@ -389,7 +387,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
- INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
@@ -415,7 +412,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long end_index;
+ pgoff_t end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
@@ -443,11 +440,15 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (fs_info->sectorsize < PAGE_SIZE)
return 0;
+ /* For bs > ps cases, we don't support readahead for compressed folios for now. */
+ if (fs_info->block_min_order)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
- u64 page_end;
- u64 pg_index = cur >> PAGE_SHIFT;
+ pgoff_t page_end;
+ pgoff_t pg_index = cur >> PAGE_SHIFT;
u32 add_size;
if (pg_index > end_index)
@@ -474,8 +475,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
- folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
- ~__GFP_FS), 0);
+ folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS),
+ 0, NULL);
if (!folio)
break;
@@ -499,9 +500,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
- lock_extent(tree, cur, page_end, NULL);
+ btrfs_lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+ em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
/*
@@ -510,20 +511,20 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* to this compressed extent on disk.
*/
if (!em || cur < em->start ||
- (cur + fs_info->sectorsize > extent_map_end(em)) ||
- (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
+ (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
orig_bio->bi_iter.bi_sector) {
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
folio_unlock(folio);
folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
- if (folio->index == end_index) {
+ if (folio_contains(folio, end_index)) {
size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
@@ -576,19 +577,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
struct extent_map *em;
unsigned long pflags;
int memstall = 0;
- blk_status_t ret;
- int ret2;
+ blk_status_t status;
+ int ret;
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
- ret = BLK_STS_IOERR;
+ status = BLK_STS_IOERR;
goto out;
}
- ASSERT(extent_map_is_compressed(em));
+ ASSERT(btrfs_extent_map_is_compressed(em));
compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
@@ -600,21 +601,23 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_map_compression(em);
+ cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
+ cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
- ret = BLK_STS_RESOURCE;
+ status = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
- if (ret2) {
- ret = BLK_STS_RESOURCE;
+ ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
+ cb->compressed_folios);
+ if (ret) {
+ status = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
}
@@ -637,7 +640,7 @@ out_free_compressed_pages:
out_free_bio:
bio_put(&cb->bbio.bio);
out:
- btrfs_bio_end_io(bbio, ret);
+ btrfs_bio_end_io(bbio, status);
}
/*
@@ -687,8 +690,6 @@ struct heuristic_ws {
struct list_head list;
};
-static struct workspace_manager heuristic_wsm;
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -701,7 +702,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
{
struct heuristic_ws *ws;
@@ -728,11 +729,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-const struct btrfs_compress_op btrfs_heuristic_compress = {
- .workspace_manager = &heuristic_wsm,
-};
+const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
-static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
@@ -740,13 +739,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, unsigned int level)
+static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
- case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
- case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -772,44 +771,58 @@ static void free_workspace(int type, struct list_head *ws)
}
}
-static void btrfs_init_workspace_manager(int type)
+static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm;
struct list_head *workspace;
- wsm = btrfs_compress_op[type]->workspace_manager;
- INIT_LIST_HEAD(&wsm->idle_ws);
- spin_lock_init(&wsm->ws_lock);
- atomic_set(&wsm->total_ws, 0);
- init_waitqueue_head(&wsm->ws_wait);
+ ASSERT(fs_info->compr_wsm[type] == NULL);
+ gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
+ if (!gwsm)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&gwsm->idle_ws);
+ spin_lock_init(&gwsm->ws_lock);
+ atomic_set(&gwsm->total_ws, 0);
+ init_waitqueue_head(&gwsm->ws_wait);
+ fs_info->compr_wsm[type] = gwsm;
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = alloc_workspace(type, 0);
+ workspace = alloc_workspace(fs_info, type, 0);
if (IS_ERR(workspace)) {
- pr_warn(
- "BTRFS: cannot preallocate compression workspace, will try later\n");
+ btrfs_warn(fs_info,
+ "cannot preallocate compression workspace for %s, will try later",
+ btrfs_compress_type2str(type));
} else {
- atomic_set(&wsm->total_ws, 1);
- wsm->free_ws = 1;
- list_add(workspace, &wsm->idle_ws);
+ atomic_set(&gwsm->total_ws, 1);
+ gwsm->free_ws = 1;
+ list_add(workspace, &gwsm->idle_ws);
}
+ return 0;
}
-static void btrfs_cleanup_workspace_manager(int type)
+static void free_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsman;
struct list_head *ws;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
- wsman = btrfs_compress_op[type]->workspace_manager;
- while (!list_empty(&wsman->idle_ws)) {
- ws = wsman->idle_ws.next;
+ /* ZSTD uses its own workspace manager, it should not enter here. */
+ ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
+ if (!gwsm)
+ return;
+ fs_info->compr_wsm[type] = NULL;
+ while (!list_empty(&gwsm->idle_ws)) {
+ ws = gwsm->idle_ws.next;
list_del(ws);
free_workspace(type, ws);
- atomic_dec(&wsman->total_ws);
+ atomic_dec(&gwsm->total_ws);
}
+ kfree(gwsm);
}
/*
@@ -818,9 +831,9 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantee and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, unsigned int level)
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *wsm = fs_info->compr_wsm[type];
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -830,7 +843,7 @@ struct list_head *btrfs_get_workspace(int type, unsigned int level)
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
+ ASSERT(wsm);
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -866,7 +879,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = alloc_workspace(type, level);
+ workspace = alloc_workspace(fs_info, type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -888,22 +901,22 @@ again:
/* once per minute */ 60 * HZ,
/* no burst */ 1);
- if (__ratelimit(&_rs)) {
- pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
- }
+ if (__ratelimit(&_rs))
+ btrfs_warn(fs_info,
+ "no compression workspaces, low memory, retrying");
}
goto again;
}
return workspace;
}
-static struct list_head *get_workspace(int type, int level)
+static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
- case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -917,21 +930,21 @@ static struct list_head *get_workspace(int type, int level)
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(int type, struct list_head *ws)
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
- idle_ws = &wsm->idle_ws;
- ws_lock = &wsm->ws_lock;
- total_ws = &wsm->total_ws;
- ws_wait = &wsm->ws_wait;
- free_ws = &wsm->free_ws;
+ ASSERT(gwsm);
+ idle_ws = &gwsm->idle_ws;
+ ws_lock = &gwsm->ws_lock;
+ total_ws = &gwsm->total_ws;
+ ws_wait = &gwsm->ws_wait;
+ free_ws = &gwsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
@@ -948,13 +961,13 @@ wake:
cond_wake_up(ws_wait);
}
-static void put_workspace(int type, struct list_head *ws)
+static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
default:
/*
* This can't happen, the type is validated several times
@@ -968,18 +981,28 @@ static void put_workspace(int type, struct list_head *ws)
* Adjust @level according to the limits of the compression algorithm or
* fallback to default
*/
-static unsigned int btrfs_compress_set_level(int type, unsigned level)
+static int btrfs_compress_set_level(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
if (level == 0)
- level = ops->default_level;
+ level = levels->default_level;
else
- level = min(level, ops->max_level);
+ level = clamp(level, levels->min_level, levels->max_level);
return level;
}
+/*
+ * Check whether the @level is within the valid range for the given type.
+ */
+bool btrfs_compress_level_valid(unsigned int type, int level)
+{
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
+
+ return levels->min_level <= level && level <= levels->max_level;
+}
+
/* Wrapper around find_get_page(), with extra error message. */
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret)
@@ -1012,46 +1035,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* - compression algo are 0-3
* - the level are bits 4-7
*
- * @out_pages is an in/out parameter, holds maximum number of pages to allocate
- * and returns number of actually allocated pages
+ * @out_folios is an in/out parameter, holds maximum number of folios to allocate
+ * and returns number of actually allocated folios
*
* @total_in is used to return the number of bytes actually read. It
* may be smaller than the input length if we had to exit early because we
- * ran out of room in the pages array or because we cross the
+ * ran out of room in the folios array or because we cross the
* max_out threshold.
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
- int type = btrfs_compress_type(type_level);
- int level = btrfs_compress_level(type_level);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
level = btrfs_compress_set_level(type, level);
- workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ workspace = get_workspace(fs_info, type, level);
+ ret = compression_compress_pages(type, workspace, inode, start, folios,
out_folios, total_in, total_out);
/* The total read-in bytes should be no larger than the input. */
ASSERT(*total_in <= orig_len);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct list_head *workspace;
int ret;
int type = cb->compress_type;
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress_bio(workspace, cb);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
if (!ret)
zero_fill_bio(&cb->orig_bbio->bio);
@@ -1061,7 +1084,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
/*
* a less complex decompression routine. Our compressed data fits in a
* single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
+ * dest_pgoff tells us the offset into the destination folio where we write the
+ * decompressed data.
*/
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
@@ -1072,20 +1096,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
int ret;
/*
- * The full destination page range should not exceed the page size.
+ * The full destination folio range should not exceed the folio size.
* And the @destlen should not exceed sectorsize, as this is only called for
* inline file extents, which should not exceed sectorsize.
*/
- ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+ ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ if (ret < 0)
+ goto error;
+ ret = zstd_alloc_workspace_manager(fs_info);
+ if (ret < 0)
+ goto error;
+ return 0;
+error:
+ btrfs_free_compress_wsm(fs_info);
+ return ret;
+}
+
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ zstd_free_workspace_manager(fs_info);
+}
+
int __init btrfs_init_compress(void)
{
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
@@ -1097,11 +1151,6 @@ int __init btrfs_init_compress(void)
if (!compr_pool.shrinker)
return -ENOMEM;
- btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_init_workspace_manager();
-
spin_lock_init(&compr_pool.lock);
INIT_LIST_HEAD(&compr_pool.list);
compr_pool.count = 0;
@@ -1122,14 +1171,26 @@ void __cold btrfs_exit_compress(void)
btrfs_compr_pool_scan(NULL, NULL);
shrinker_free(compr_pool.shrinker);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_cleanup_workspace_manager();
bioset_exit(&btrfs_compressed_bioset);
}
/*
+ * The bvec is a single page bvec from a bio that contains folios from a filemap.
+ *
+ * Since the folio may be a large one, page->index is unreliable when the
+ * bv_page is not the head page of that large folio.
+ *
+ * Thus we need this helper to grab the proper file offset.
+ */
+static u64 file_offset_from_bvec(const struct bio_vec *bvec)
+{
+ const struct page *page = bvec->bv_page;
+ const struct folio *folio = page_folio(page);
+
+ return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
+}
+
+/*
* Copy decompressed data from working buffer to pages.
*
* @buf: The decompressed data buffer
@@ -1174,13 +1235,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
u32 copy_start;
/* Offset inside the full decompressed extent */
u32 bvec_offset;
+ void *kaddr;
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
/*
* cb->start may underflow, but subtracting that value can still
* give us correct offset inside the full decompressed extent.
*/
- bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
+ bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
/* Haven't reached the bvec range, exit */
if (decompressed + buf_len <= bvec_offset)
@@ -1196,10 +1258,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
* @buf + @buf_len.
*/
ASSERT(copy_start - decompressed < buf_len);
- memcpy_to_page(bvec.bv_page, bvec.bv_offset,
- buf + copy_start - decompressed, copy_len);
- cur_offset += copy_len;
+ kaddr = bvec_kmap_local(&bvec);
+ memcpy(kaddr, buf + copy_start - decompressed, copy_len);
+ kunmap_local(kaddr);
+
+ cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
/* Finished the bio */
if (!orig_bio->bi_iter.bi_size)
@@ -1229,7 +1293,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
#define ENTROPY_LVL_HIGH (80)
/*
- * For increasead precision in shannon_entropy calculation,
+ * For increased precision in shannon_entropy calculation,
* let's do pow(n, M) to save more digits after comma:
*
* - maximum int bit length is 64
@@ -1455,7 +1519,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
struct heuristic_ws *ws)
{
struct page *page;
- u64 index, index_end;
+ pgoff_t index, index_end;
u32 i, curr_sample_pos;
u8 *in_data;
@@ -1515,7 +1579,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = get_workspace(0, 0);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct list_head *ws_list = get_workspace(fs_info, 0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1584,29 +1649,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
}
out:
- put_workspace(0, ws_list);
+ put_workspace(fs_info, 0, ws_list);
return ret;
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
- unsigned int level = 0;
+ int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
- ret = kstrtouint(str + 1, 10, &level);
+ ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
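
btrfs_compress_str2level() now reports failure instead of silently falling back to the default, and returns the clamped level through an out parameter. A hypothetical usage sketch for the new prototype (parsing the ":N" suffix of a mount option; the wrapper name is invented):

/* Illustrative only: parse "zstd:3"-style suffixes with the new prototype. */
static int parse_zstd_level_example(const char *suffix, int *level)
{
	int ret;

	ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, suffix, level);
	if (ret < 0)
		return ret;	/* invalid numeric suffix */
	return 0;
}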
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 954034086d0d..e0228017e861 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -11,14 +11,15 @@
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
+#include <linux/pagemap.h>
#include "bio.h"
+#include "fs.h"
+#include "btrfs_inode.h"
struct address_space;
-struct page;
struct inode;
struct btrfs_inode;
struct btrfs_ordered_extent;
-struct btrfs_bio;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -62,42 +63,39 @@ struct compressed_bio {
/* Whether this is a write for writeback. */
bool writeback;
- union {
- /* For reads, this is the bio we are copying the data into */
- struct btrfs_bio *orig_bbio;
- struct work_struct write_end_work;
- };
+ /* For reads, this is the bio we are copying the data into. */
+ struct btrfs_bio *orig_bbio;
/* Must be last. */
struct btrfs_bio bbio;
};
-static inline unsigned int btrfs_compress_type(unsigned int type_level)
+static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb)
{
- return (type_level & 0xF);
-}
-
-static inline unsigned int btrfs_compress_level(unsigned int type_level)
-{
- return ((type_level & 0xF0) >> 4);
+ return cb->bbio.inode->root->fs_info;
}
/* @range_end must be exclusive. */
-static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
+static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
{
- u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
-
- return min(range_end, page_end) - cur;
+ /* @cur must be inside the folio. */
+ ASSERT(folio_pos(folio) <= cur);
+ ASSERT(cur < folio_next_pos(folio));
+ return umin(range_end, folio_next_pos(folio)) - cur;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info);
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+bool btrfs_compress_level_valid(unsigned int type, int level);
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
- unsigned long start_byte, size_t srclen, size_t destlen);
+ unsigned long dest_pgoff, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -107,19 +105,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
-struct folio *btrfs_alloc_compr_folio(void);
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info);
void btrfs_free_compr_folio(struct folio *folio);
-enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LZO = 2,
- BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_NR_COMPRESS_TYPES = 4,
-};
-
struct workspace_manager {
struct list_head idle_ws;
spinlock_t ws_lock;
@@ -131,23 +121,23 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, unsigned int level);
-void btrfs_put_workspace(int type, struct list_head *ws);
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level);
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws);
-struct btrfs_compress_op {
- struct workspace_manager *workspace_manager;
+struct btrfs_compress_levels {
/* Maximum level supported by the compression algorithm */
- unsigned int max_level;
- unsigned int default_level;
+ int min_level;
+ int max_level;
+ int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
-extern const struct btrfs_compress_op btrfs_heuristic_compress;
-extern const struct btrfs_compress_op btrfs_zlib_compress;
-extern const struct btrfs_compress_op btrfs_lzo_compress;
-extern const struct btrfs_compress_op btrfs_zstd_compress;
+extern const struct btrfs_compress_levels btrfs_heuristic_compress;
+extern const struct btrfs_compress_levels btrfs_zlib_compress;
+extern const struct btrfs_compress_levels btrfs_lzo_compress;
+extern const struct btrfs_compress_levels btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
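
btrfs_compress_set_level() itself is not in this hunk, but with the new signed min_level/max_level/default_level fields (zstd accepts negative levels) the clamping it performs is presumably along these lines; this is an inference from the struct layout and the str2level comment, not a quote of the implementation:

	/* Sketch only: assumes level 0 still means "use the per-algorithm default". */
	static int example_set_level(const struct btrfs_compress_levels *levels, int level)
	{
		if (level == 0)
			return levels->default_level;
		return clamp(level, levels->min_level, levels->max_level);
	}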
@@ -157,39 +147,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(void);
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info);
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info);
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level);
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws);
#endif
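
One behavioural note on the btrfs_calc_input_length() change above: the result is now bounded by the end of the folio rather than a hard PAGE_SIZE boundary, which matters once large folios are involved. A worked example with assumed numbers:

	/*
	 * Assumed values, purely illustrative:
	 *   folio_pos(folio) = 128K, folio length = 64K  ->  folio_next_pos(folio) = 192K
	 *   range_end        = 160K (exclusive)
	 *   cur              = 150K
	 *
	 * New:  umin(160K, 192K) - 150K = 10K taken from this folio in one go.
	 * Old:  min(160K, round_down(150K, 4K) + 4K) - 150K = 152K - 150K = 2K,
	 *       i.e. the caller had to stop at every 4K page boundary.
	 */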
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 148648ea1c8b..a48b4befbee7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -30,26 +30,13 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
- int data_size, int extend);
+ int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty);
+ struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-
-static const struct btrfs_csums {
- u16 size;
- const char name[10];
- const char driver[12];
-} btrfs_csums[] = {
- [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
- [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
- [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
- [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
- .driver = "blake2b-256" },
-};
-
/*
* The leaf data grows from end-to-front in the node. this returns the address
* of the start of the last item, which is the stop of the leaf data stack.
@@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,
nr_items * sizeof(struct btrfs_item));
}
-/* This exists for btrfs-progs usages. */
-u16 btrfs_csum_type_size(u16 type)
-{
- return btrfs_csums[type].size;
-}
-
-int btrfs_super_csum_size(const struct btrfs_super_block *s)
-{
- u16 t = btrfs_super_csum_type(s);
- /*
- * csum type is validated at mount time
- */
- return btrfs_csum_type_size(t);
-}
-
-const char *btrfs_super_csum_name(u16 csum_type)
-{
- /* csum type is validated at mount time */
- return btrfs_csums[csum_type].name;
-}
-
-/*
- * Return driver name if defined, otherwise the name that's also a valid driver
- * name
- */
-const char *btrfs_super_csum_driver(u16 csum_type)
-{
- /* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver[0] ?
- btrfs_csums[csum_type].driver :
- btrfs_csums[csum_type].name;
-}
-
-size_t __attribute_const__ btrfs_get_num_csums(void)
-{
- return ARRAY_SIZE(btrfs_csums);
-}
-
struct btrfs_path *btrfs_alloc_path(void)
{
might_sleep();
@@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
- * We want the transaction abort to print stack trace only for errors where the
- * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
- * caused by external factors.
- */
-bool __cold abort_should_print_stack(int error)
-{
- switch (error) {
- case -EIO:
- case -EROFS:
- case -ENOMEM:
- return false;
- }
- return true;
-}
-
-/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -265,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
- if (atomic_inc_not_zero(&eb->refs)) {
+ if (refcount_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
@@ -350,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
- WARN_ON(btrfs_header_generation(buf) > trans->transid);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- else
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
+ }
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -370,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
const u64 buf_gen = btrfs_header_generation(buf);
@@ -592,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -612,18 +556,18 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -631,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -642,18 +586,20 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
}
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
}
+
+ trace_btrfs_cow_block(root, buf, cow);
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
@@ -667,15 +613,12 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
- return 0;
-
- /* Ensure we can see the FORCE_COW bit */
- smp_mb__before_atomic();
+ return false;
/*
* We do not need to cow a block if
@@ -688,13 +631,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
- if (btrfs_header_generation(buf) == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
- return 0;
- return 1;
+
+ if (btrfs_header_generation(buf) != trans->transid)
+ return true;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+ return true;
+
+ /* Ensure we can see the FORCE_COW bit. */
+ smp_mb__before_atomic();
+ if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return true;
+
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return false;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return true;
+
+ return false;
}
/*
@@ -710,7 +665,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
- int ret;
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
btrfs_abort_transaction(trans, -EUCLEAN);
@@ -751,12 +705,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
- cow_ret, search_start, 0, nest);
-
- trace_btrfs_cow_block(root, buf, *cow_ret);
-
- return ret;
+ return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
@@ -794,7 +744,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot)
{
unsigned long p;
@@ -903,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
&check);
if (IS_ERR(eb))
return eb;
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return ERR_PTR(-EIO);
}
@@ -912,6 +862,75 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
}
/*
+ * Promote a child node to become the new tree root.
+ *
+ * @trans: Transaction handle
+ * @root: Tree root structure to update
+ * @path: Path holding nodes and locks
+ * @level: Level of the parent (old root)
+ * @parent: The parent (old root) with exactly one item
+ *
+ * This helper is called during rebalancing when the root node contains only
+ * a single item (nritems == 1). We can reduce the tree height by promoting
+ * that child to become the new root and freeing the old root node. The path
+ * locks and references are updated accordingly.
+ *
+ * Return: 0 on success, negative errno on failure. The transaction is aborted
+ * on critical errors.
+ */
+static int promote_child_to_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int level, struct extent_buffer *parent)
+{
+ struct extent_buffer *child;
+ int ret;
+
+ ASSERT(btrfs_header_nritems(parent) == 1);
+
+ child = btrfs_read_node_slot(parent, 0);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ btrfs_tree_lock(child);
+ ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ return ret;
+ }
+
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
+ if (unlikely(ret < 0)) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ rcu_assign_pointer(root->node, child);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ btrfs_clear_buffer_dirty(trans, parent);
+ btrfs_tree_unlock(parent);
+ /* Once for the path. */
+ free_extent_buffer(parent);
+
+ root_sub_used_bytes(root);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1);
+ /* Once for the root ptr. */
+ free_extent_buffer_stale(parent);
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* node level balancing, used to make sure nodes are in proper order for
* item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
@@ -950,55 +969,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* by promoting the node below to a root
*/
if (!parent) {
- struct extent_buffer *child;
-
if (btrfs_header_nritems(mid) != 1)
return 0;
- /* promote the child to a root */
- child = btrfs_read_node_slot(mid, 0);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
- goto out;
- }
-
- btrfs_tree_lock(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
- BTRFS_NESTING_COW);
- if (ret) {
- btrfs_tree_unlock(child);
- free_extent_buffer(child);
- goto out;
- }
-
- ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
- if (ret < 0) {
- btrfs_tree_unlock(child);
- free_extent_buffer(child);
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- rcu_assign_pointer(root->node, child);
-
- add_root_to_dirty_list(root);
- btrfs_tree_unlock(child);
-
- path->locks[level] = 0;
- path->nodes[level] = NULL;
- btrfs_clear_buffer_dirty(trans, mid);
- btrfs_tree_unlock(mid);
- /* once for the path */
- free_extent_buffer(mid);
-
- root_sub_used_bytes(root);
- ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
- /* once for the root ptr */
- free_extent_buffer_stale(mid);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- return 0;
+ return promote_child_to_root(trans, root, path, level, mid);
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
@@ -1069,7 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1078,7 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1130,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1140,7 +1114,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1151,11 +1125,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
path->slots[level] = orig_slot;
+ /* Left is now owned by path. */
+ left = NULL;
if (mid) {
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
@@ -1175,8 +1150,7 @@ out:
free_extent_buffer(right);
}
if (left) {
- if (path->nodes[level] != left)
- btrfs_tree_unlock(left);
+ btrfs_tree_unlock(left);
free_extent_buffer(left);
}
return ret;
@@ -1245,7 +1219,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(left);
free_extent_buffer(left);
btrfs_abort_transaction(trans, ret);
@@ -1305,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
btrfs_abort_transaction(trans, ret);
@@ -1338,7 +1312,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* to the block in 'slot', and triggering ra on them.
*/
static void reada_for_search(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
int level, int slot, u64 objectid)
{
struct extent_buffer *node;
@@ -1420,7 +1394,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_path *path, int level)
+static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
struct extent_buffer *parent;
int slot;
@@ -1485,8 +1459,8 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
if (i >= lowest_unlock && i > skip_level) {
- check_skip = false;
btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ check_skip = false;
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -1516,8 +1490,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 blocknr;
struct extent_buffer *tmp = NULL;
int ret = 0;
+ int ret2;
int parent_level;
- int err;
bool read_tmp = false;
bool tmp_locked = false;
bool path_released = false;
@@ -1543,13 +1517,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp, &check)) {
+ if (unlikely(btrfs_verify_level_key(tmp, &check))) {
ret = -EUCLEAN;
goto out;
}
@@ -1566,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
btrfs_unlock_up_safe(p, parent_level + 1);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1574,9 +1549,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1609,6 +1584,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
ASSERT(ret == -EAGAIN);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1616,9 +1592,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1628,7 +1604,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp)) {
+ if (unlikely(!extent_buffer_uptodate(tmp))) {
ret = -EIO;
goto out;
}
@@ -1753,13 +1729,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
if (p->search_commit_root) {
b = root->commit_root;
- atomic_inc(&b->refs);
+ refcount_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
- * p->search_commit_root = 1.
+ * p->search_commit_root is true.
*/
- ASSERT(p->skip_locking == 1);
+ ASSERT(p->skip_locking);
goto out;
}
@@ -1809,7 +1785,7 @@ out:
* The root may have failed to write out at some point, and thus is no
* longer valid, return an error in this case.
*/
- if (!extent_buffer_uptodate(b)) {
+ if (unlikely(!extent_buffer_uptodate(b))) {
if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b);
@@ -1862,7 +1838,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path)
return 0;
}
-static inline int search_for_key_slot(struct extent_buffer *eb,
+static inline int search_for_key_slot(const struct extent_buffer *eb,
int search_low_slot,
const struct btrfs_key *key,
int prev_cmp,
@@ -1996,15 +1972,14 @@ static int search_leaf(struct btrfs_trans_handle *trans,
ASSERT(leaf_free_space >= 0);
if (leaf_free_space < ins_len) {
- int err;
-
- err = split_leaf(trans, root, key, path, ins_len,
- (ret == 0));
- ASSERT(err <= 0);
- if (WARN_ON(err > 0))
- err = -EUCLEAN;
- if (err)
- ret = err;
+ int ret2;
+
+ ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
+ ASSERT(ret2 <= 0);
+ if (WARN_ON(ret2 > 0))
+ ret2 = -EUCLEAN;
+ if (ret2)
+ ret = ret2;
}
}
@@ -2046,11 +2021,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info;
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
/* everything at write_lock_level or lower must be write locked */
@@ -2059,6 +2033,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int min_write_lock_level;
int prev_cmp;
+ if (!root)
+ return -EINVAL;
+
+ fs_info = root->fs_info;
might_sleep();
lowest_level = p->lowest_level;
@@ -2117,6 +2095,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
@@ -2145,16 +2124,15 @@ again:
}
if (last_level)
- err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b,
- BTRFS_NESTING_COW);
+ ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b, BTRFS_NESTING_COW);
else
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b,
- BTRFS_NESTING_COW);
- if (err) {
- ret = err;
+ ret2 = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
+ if (ret2) {
+ ret = ret2;
goto done;
}
}
@@ -2202,12 +2180,12 @@ cow_done:
slot--;
}
p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
- &write_lock_level);
- if (err == -EAGAIN)
+ ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (ret2 == -EAGAIN)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
b = p->nodes[level];
@@ -2233,11 +2211,11 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2300,7 +2278,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -2316,7 +2293,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = btrfs_get_old_root(root, time_seq);
- if (!b) {
+ if (unlikely(!b)) {
ret = -EIO;
goto done;
}
@@ -2325,6 +2302,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -2360,11 +2338,11 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2645,12 +2623,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
- "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
- new_key->objectid, new_key->type,
- new_key->offset);
+ BTRFS_KEY_FMT_VALUE(new_key));
BUG();
}
}
@@ -2659,12 +2636,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
- "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
- new_key->objectid, new_key->type,
- new_key->offset);
+ BTRFS_KEY_FMT_VALUE(new_key));
BUG();
}
}
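
BTRFS_KEY_FMT and BTRFS_KEY_FMT_VALUE are introduced elsewhere in this series (likely fs/btrfs/messages.h, going by the diffstat), so their definitions are not visible here. The call sites above only work if they expand to roughly the following; treat this as an inference rather than a quote:

	/* Presumed definitions, matching the format string they replace. */
	#define BTRFS_KEY_FMT			"(%llu %u %llu)"
	#define BTRFS_KEY_FMT_VALUE(key)	(key)->objectid, (key)->type, (key)->offset

Note that the disk-key half of each message still passes btrfs_disk_key_objectid()/type()/offset() explicitly, consistent with BTRFS_KEY_FMT_VALUE() only dereferencing a CPU-order struct btrfs_key.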
@@ -2723,10 +2699,9 @@ static bool check_sibling_keys(const struct extent_buffer *left,
btrfs_crit(left->fs_info, "right extent buffer:");
btrfs_print_tree(right, false);
btrfs_crit(left->fs_info,
-"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
- left_last.objectid, left_last.type,
- left_last.offset, right_first.objectid,
- right_first.type, right_first.offset);
+"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&left_last),
+ BTRFS_KEY_FMT_VALUE(&right_first));
return true;
}
return false;
@@ -2741,7 +2716,7 @@ static bool check_sibling_keys(const struct extent_buffer *left,
*/
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty)
+ struct extent_buffer *src, bool empty)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
@@ -2777,13 +2752,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items = min(src_nritems - 8, push_items);
/* dst is the left eb, src is the middle eb */
- if (check_sibling_keys(dst, src)) {
+ if (unlikely(check_sibling_keys(dst, src))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2851,7 +2826,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
push_items = max_push;
/* dst is the right eb, src is the middle eb */
- if (check_sibling_keys(src, dst)) {
+ if (unlikely(check_sibling_keys(src, dst))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
@@ -2868,7 +2843,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2936,8 +2911,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
@@ -2949,7 +2925,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- atomic_inc(&c->refs);
+ refcount_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
@@ -2982,7 +2958,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2995,7 +2971,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3071,7 +3047,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(split);
free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
@@ -3140,7 +3116,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
int ret;
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_crit(fs_info,
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret,
@@ -3156,7 +3132,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
*/
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- int data_size, int empty,
+ int data_size, bool empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
@@ -3164,7 +3140,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
- struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -3238,13 +3213,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
- btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space -= btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space -= btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
left_nritems -= push_items;
@@ -3264,10 +3238,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
- if (btrfs_header_nritems(path->nodes[0]) == 0)
- btrfs_clear_buffer_dirty(trans, path->nodes[0]);
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
path->nodes[0] = right;
path->slots[1] += 1;
} else {
@@ -3295,7 +3267,7 @@ out_unlock:
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int min_data_size, int data_size,
- int empty, u32 min_slot)
+ bool empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -3334,7 +3306,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
btrfs_tree_unlock(right);
@@ -3372,7 +3344,7 @@ out_unlock:
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int data_size,
- int empty, struct extent_buffer *left,
+ bool empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
@@ -3387,7 +3359,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
int ret = 0;
u32 this_item_size;
u32 old_left_item_size;
- struct btrfs_map_token token;
if (empty)
nr = min(right_nritems, max_slot);
@@ -3435,21 +3406,24 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
- btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(left, i);
+ btrfs_set_item_offset(left, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
- if (push_items > right_nritems)
- WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
- right_nritems);
+ if (unlikely(push_items > right_nritems)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)",
+ push_items, right_nritems);
+ goto out;
+ }
if (push_items < right_nritems) {
push_space = btrfs_item_offset(right, push_items - 1) -
@@ -3462,13 +3436,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_header_nritems(right) - push_items);
}
- btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space = push_space - btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space = push_space - btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
btrfs_mark_buffer_dirty(trans, left);
@@ -3483,8 +3456,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
path->nodes[0] = left;
path->slots[1] -= 1;
} else {
@@ -3553,7 +3526,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3582,7 +3555,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
int i;
int ret;
struct btrfs_disk_key disk_key;
- struct btrfs_map_token token;
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -3595,12 +3567,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
- btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ ioff = btrfs_item_offset(right, i);
+ btrfs_set_item_offset(right, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -3703,7 +3674,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size,
- int extend)
+ bool extend)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *l;
@@ -3899,6 +3870,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_RAID_STRIPE_KEY &&
key.type != BTRFS_EXTENT_CSUM_KEY);
if (btrfs_leaf_free_space(leaf) >= ins_len)
@@ -3912,10 +3884,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- path->keep_locks = 1;
- path->search_for_split = 1;
+ path->keep_locks = true;
+ path->search_for_split = true;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- path->search_for_split = 0;
+ path->search_for_split = false;
if (ret > 0)
ret = -EAGAIN;
if (ret < 0)
@@ -3942,11 +3914,11 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (ret)
goto err;
- path->keep_locks = 0;
+ path->keep_locks = false;
btrfs_unlock_up_safe(path, 1);
return 0;
err:
- path->keep_locks = 0;
+ path->keep_locks = false;
return ret;
}
@@ -4065,7 +4037,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -4088,12 +4059,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + size_diff);
}
/* shift the data */
@@ -4137,7 +4107,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4156,14 +4126,13 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- if (btrfs_leaf_free_space(leaf) < data_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4171,7 +4140,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
- if (slot >= nritems) {
+ if (unlikely(slot >= nritems)) {
btrfs_print_leaf(leaf);
btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
@@ -4182,24 +4151,22 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff - data_size);
}
/* shift the data */
memmove_leaf_data(leaf, data_end - data_size, data_end,
old_data - data_end);
- data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4227,7 +4194,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
int slot;
- struct btrfs_map_token token;
u32 total_size;
/*
@@ -4248,18 +4214,17 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
data_end = leaf_data_end(leaf);
total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
- if (btrfs_leaf_free_space(leaf) < total_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(leaf));
BUG();
}
- btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
- if (old_data < data_end) {
+ if (unlikely(old_data < data_end)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info,
"item at slot %d with data offset %u beyond data end of leaf %u",
@@ -4273,8 +4238,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i,
ioff - batch->total_data_size);
}
/* shift the items */
@@ -4291,14 +4256,14 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, slot + i, data_end);
- btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ btrfs_set_item_offset(leaf, slot + i, data_end);
+ btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4369,7 +4334,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 data_size)
{
int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
unsigned long ptr;
@@ -4383,7 +4348,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
write_extent_buffer(leaf, data, ptr, data_size);
btrfs_mark_buffer_dirty(trans, leaf);
}
- btrfs_free_path(path);
return ret;
}
@@ -4441,7 +4405,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (level) {
ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4454,7 +4418,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4506,7 +4470,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
if (ret < 0)
@@ -4533,7 +4497,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
const int data_end = leaf_data_end(leaf);
- struct btrfs_map_token token;
u32 dsize = 0;
int i;
@@ -4543,12 +4506,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
memmove_leaf_data(leaf, data_end + dsize, data_end,
last_off - data_end);
- btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + dsize);
}
memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
@@ -4558,9 +4520,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/* delete the leaf if we've emptied it */
if (nritems == 0) {
- if (leaf == root->node) {
- btrfs_set_header_level(leaf, 0);
- } else {
+ if (leaf != root->node) {
btrfs_clear_buffer_dirty(trans, leaf);
ret = btrfs_del_leaf(trans, root, path, leaf);
if (ret < 0)
@@ -4591,7 +4551,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to btrfs_del_ptr below
*/
slot = path->slots[1];
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
/*
* We want to be able to at least push one item to the
* left neighbour leaf, and that's the first item.
@@ -4626,10 +4586,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
ret = btrfs_del_leaf(trans, root, path, leaf);
+ free_extent_buffer(leaf);
if (ret < 0)
return ret;
- free_extent_buffer(leaf);
- ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4649,16 +4608,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are have a minimum transaction id.
+ * for leaves that have a minimum transaction id.
* This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This honors path->lowest_level to prevent descent past a given level
- * of the tree.
- *
* min_trans indicates the oldest transaction that you are interested
* in walking through. Any nodes or leaves older than min_trans are
* skipped over (without reading them).
@@ -4671,16 +4627,16 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u64 min_trans)
{
struct extent_buffer *cur;
- struct btrfs_key found_key;
int slot;
int sret;
u32 nritems;
int level;
int ret = 1;
- int keep_locks = path->keep_locks;
+ const bool keep_locks = path->keep_locks;
ASSERT(!path->nowait);
- path->keep_locks = 1;
+ ASSERT(path->lowest_level == 0);
+ path->keep_locks = true;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -4701,13 +4657,14 @@ again:
goto out;
}
- /* at the lowest level, we're done, setup the path and exit */
- if (level == path->lowest_level) {
+ /* At level 0 we're done, setup the path and exit. */
+ if (level == 0) {
if (slot >= nritems)
goto find_next_key;
ret = 0;
path->slots[level] = slot;
- btrfs_item_key_to_cpu(cur, &found_key, slot);
+ /* Save our key for returning to the caller. */
+ btrfs_item_key_to_cpu(cur, min_key, slot);
goto out;
}
if (sret && slot > 0)
@@ -4731,8 +4688,8 @@ find_next_key:
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
+ path->slots[level] = slot;
if (slot >= nritems) {
- path->slots[level] = slot;
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -4742,13 +4699,6 @@ find_next_key:
goto out;
}
}
- /* save our key for returning back */
- btrfs_node_key_to_cpu(cur, &found_key, slot);
- path->slots[level] = slot;
- if (level == path->lowest_level) {
- ret = 0;
- goto out;
- }
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -4763,10 +4713,8 @@ find_next_key:
}
out:
path->keep_locks = keep_locks;
- if (ret == 0) {
- btrfs_unlock_up_safe(path, path->lowest_level + 1);
- memcpy(min_key, &found_key, sizeof(found_key));
- }
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -4778,7 +4726,7 @@ out:
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
*
- * path->keep_locks should be set to 1 on the search made before
+ * path->keep_locks should be set to true on the search made before
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -4877,13 +4825,13 @@ again:
next = NULL;
btrfs_release_path(path);
- path->keep_locks = 1;
+ path->keep_locks = true;
if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
} else {
if (path->need_commit_sem) {
- path->need_commit_sem = 0;
+ path->need_commit_sem = false;
need_commit_sem = true;
if (path->nowait) {
if (!down_read_trylock(&fs_info->commit_root_sem)) {
@@ -4896,41 +4844,30 @@ again:
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
- path->keep_locks = 0;
+ path->keep_locks = false;
if (ret < 0)
goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
- * by releasing the path above we dropped all our locks. A balance
- * could have added more items next to the key that used to be
- * at the very end of the block. So, check again here and
- * advance the path if there are now more items available.
- */
- if (nritems > 0 && path->slots[0] < nritems - 1) {
- if (ret == 0)
- path->slots[0]++;
- ret = 0;
- goto done;
- }
- /*
- * So the above check misses one case:
- * - after releasing the path above, someone has removed the item that
- * used to be at the very end of the block, and balance between leafs
- * gets another one with bigger key.offset to replace it.
+ * By releasing the path above we dropped all our locks. A balance
+ * could have happened and
*
- * This one should be returned as well, or we can get leaf corruption
- * later(esp. in __btrfs_drop_extents()).
+ * 1. added more items after the previous last item
+ * 2. deleted the previous last item
*
- * And a bit more explanation about this check,
- * with ret > 0, the key isn't found, the path points to the slot
- * where it should be inserted, so the path->slots[0] item must be the
- * bigger one.
+ * So, check again here and advance the path if there are now more
+ * items available.
*/
- if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
- ret = 0;
- goto done;
+ if (nritems > 0 && path->slots[0] <= nritems - 1) {
+ if (ret == 0 && path->slots[0] != nritems - 1) {
+ path->slots[0]++;
+ goto done;
+ } else if (ret > 0) {
+ ret = 0;
+ goto done;
+ }
}
while (level < BTRFS_MAX_LEVEL) {
@@ -5035,7 +4972,7 @@ done:
if (need_commit_sem) {
int ret2;
- path->need_commit_sem = 1;
+ path->need_commit_sem = true;
ret2 = finish_need_commit_sem_search(path);
up_read(&fs_info->commit_root_sem);
if (ret2)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 307dedf95c70..692370fc07b2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,8 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include "linux/cleanup.h"
-#include <linux/pagemap.h>
+#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
@@ -18,9 +17,7 @@
#include <linux/refcount.h>
#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
-#include "fs.h"
#include "accessors.h"
-#include "extent-io-tree.h"
struct extent_buffer;
struct btrfs_block_rsv;
@@ -62,27 +59,27 @@ struct btrfs_path {
/* if there is real range locking, this locks field will change */
u8 locks[BTRFS_MAX_LEVEL];
u8 reada;
- /* keep some upper locks as we walk down */
u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
* and to force calls to keep space in the nodes
*/
- unsigned int search_for_split:1;
- unsigned int keep_locks:1;
- unsigned int skip_locking:1;
- unsigned int search_commit_root:1;
- unsigned int need_commit_sem:1;
- unsigned int skip_release_on_error:1;
+ bool search_for_split:1;
+ /* Keep some upper locks as we walk down. */
+ bool keep_locks:1;
+ bool skip_locking:1;
+ bool search_commit_root:1;
+ bool need_commit_sem:1;
+ bool skip_release_on_error:1;
/*
* Indicate that new item (btrfs_search_slot) is extending already
* existing item and ins_len contains only the data size and not item
* header (ie. sizeof(struct btrfs_item) is not included).
*/
- unsigned int search_for_extension:1;
+ bool search_for_extension:1;
/* Stop search if any locks need to be taken (for read) */
- unsigned int nowait:1;
+ bool nowait:1;
};
#define BTRFS_PATH_AUTO_FREE(path_name) \
@@ -225,16 +222,10 @@ struct btrfs_root {
struct list_head root_list;
- /*
- * Xarray that keeps track of in-memory inodes, protected by the lock
- * @inode_lock.
- */
+ /* Xarray that keeps track of in-memory inodes. */
struct xarray inodes;
- /*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by @inode_lock.
- */
+ /* Xarray that keeps track of delayed nodes of every inode. */
struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
@@ -371,6 +362,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi
}
/*
+ * Return the generation this root started with.
+ *
+ * Every normal root is created with root->root_key.offset set to its
+ * originating generation. If it is a snapshot it is the generation when the
+ * snapshot was created.
+ *
+ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
+ * tree root. Thankfully we copy the root item of the owning tree root, which
+ * has its last_snapshot set to what we would have root_key.offset set to, so
+ * return that if this is a TREE_RELOC root.
+ */
+static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
+{
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return btrfs_root_last_snapshot(&root->root_item);
+ return root->root_key.offset;
+}
+
+/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
*/
@@ -487,24 +497,10 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
-#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
- ((bytes) >> (fs_info)->sectorsize_bits)
-
-static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
-{
- return mapping_gfp_constraint(mapping, ~__GFP_FS);
-}
-
-void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
-int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
-int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
-
-/* ctree.c */
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -572,9 +568,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
@@ -723,13 +719,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
}
int btrfs_leaf_free_space(const struct extent_buffer *leaf);
-static inline int is_fstree(u64 rootid)
+static inline bool btrfs_is_fstree(u64 rootid)
{
- if (rootid == BTRFS_FS_TREE_OBJECTID ||
- ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
- !btrfs_qgroup_level(rootid)))
- return 1;
- return 0;
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return true;
+
+ if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return false;
+
+ if (btrfs_qgroup_level(rootid) != 0)
+ return false;
+
+ return true;
}
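A minimal sketch of what the rewritten predicate accepts and rejects; the wrapper function is hypothetical, while the constants are the existing ones from btrfs_tree.h and qgroup.h:

static void btrfs_is_fstree_examples(void)
{
	/* The default subvolume tree (objectid 5) is an fs tree. */
	ASSERT(btrfs_is_fstree(BTRFS_FS_TREE_OBJECTID));
	/* Internal trees below BTRFS_FIRST_FREE_OBJECTID (e.g. the extent tree) are not. */
	ASSERT(!btrfs_is_fstree(BTRFS_EXTENT_TREE_OBJECTID));
	/* A non-zero qgroup level in the high 16 bits disqualifies an ID. */
	ASSERT(!btrfs_is_fstree(((u64)1 << BTRFS_QGROUP_LEVEL_SHIFT) | 257));
	/* A plain subvolume ID at or above BTRFS_FIRST_FREE_OBJECTID qualifies. */
	ASSERT(btrfs_is_fstree(257));
}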
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
@@ -737,18 +738,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
-u16 btrfs_csum_type_size(u16 type);
-int btrfs_super_csum_size(const struct btrfs_super_block *s);
-const char *btrfs_super_csum_name(u16 csum_type);
-const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __attribute_const__ btrfs_get_num_csums(void);
-
-/*
- * We use folio flag owner_2 to indicate there is an ordered extent with
- * unfinished IO.
- */
-#define folio_test_ordered(folio) folio_test_owner_2(folio)
-#define folio_set_ordered(folio) folio_set_owner_2(folio)
-#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
-
#endif
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..b81e224d4a27 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -15,6 +15,7 @@
#include "defrag.h"
#include "file-item.h"
#include "super.h"
+#include "compression.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
@@ -60,6 +61,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1,
return 0;
}
+static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
+ const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
+
+ return compare_inode_defrag(new_defrag, existing_defrag);
+}
+
/*
* Insert a record for an inode into the defrag tree. The lock must be held
* already.
@@ -71,49 +80,35 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
+ struct rb_node *node;
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
+ node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
+ if (node) {
+ struct inode_defrag *entry;
- ret = compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /*
- * If we're reinserting an entry for an old defrag run,
- * make sure to lower the transid of our existing
- * record.
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
+ entry = rb_entry(node, struct inode_defrag, rb_node);
+ /*
+ * If we're reinserting an entry for an old defrag run, make
+ * sure to lower the transid of our existing record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
+ return -EEXIST;
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
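The conversion above relies on the generic rb_find_add() contract: the comparator sees the node being inserted first and an existing node second, returns negative/zero/positive like memcmp(), and a non-NULL return is the already-present duplicate. A self-contained sketch with purely illustrative names:

#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_entry {
	struct rb_node rb_node;
	u64 key;
};

static int demo_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_entry *a = rb_entry(new, struct demo_entry, rb_node);
	const struct demo_entry *b = rb_entry(existing, struct demo_entry, rb_node);

	if (a->key < b->key)
		return -1;
	if (a->key > b->key)
		return 1;
	return 0;
}

/* Returns true if @entry was inserted, false if an entry with the same key exists. */
static bool demo_insert(struct rb_root *root, struct demo_entry *entry)
{
	return rb_find_add(&entry->rb_node, root, demo_cmp) == NULL;
}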
-static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
+ return false;
if (btrfs_fs_closing(fs_info))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -159,7 +154,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
}
/*
- * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * Pick the defraggable inode that we want; if it doesn't exist, we will get the
* next one.
*/
static struct inode_defrag *btrfs_pick_defrag_inode(
@@ -191,10 +186,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
+ entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
}
out:
if (entry)
@@ -225,7 +217,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int ret = 0;
u64 cur = 0;
@@ -250,24 +242,23 @@ again:
goto cleanup;
}
- if (cur >= i_size_read(inode)) {
- iput(inode);
+ if (cur >= i_size_read(&inode->vfs_inode)) {
+ iput(&inode->vfs_inode);
goto cleanup;
}
/* Do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(ra, inode->vfs_inode.i_mapping);
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
- iput(inode);
+ scoped_guard(super_write, fs_info->sb)
+ ret = btrfs_defrag_file(inode, ra, &range,
+ defrag->transid, BTRFS_DEFRAG_BATCH);
+ iput(&inode->vfs_inode);
if (ret < 0)
goto cleanup;
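The scoped_guard() used above comes from <linux/cleanup.h>; the super_write guard class is assumed to wrap sb_start_write()/sb_end_write(). A minimal sketch of the same pattern using the stock mutex guard class:

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);
static unsigned long demo_counter;

static void demo_bump(void)
{
	/* The lock is acquired here and released automatically after the statement. */
	scoped_guard(mutex, &demo_lock)
		demo_counter++;
}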
@@ -480,7 +471,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
memcpy(&key, &root->defrag_progress, sizeof(key));
}
- path->keep_locks = 1;
+ path->keep_locks = true;
ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret < 0)
@@ -523,7 +514,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/*
* Now that we reallocated the node we can find the next key. Note that
* btrfs_find_next_key() can release our path and do another search
-	 * without COWing, this is because even with path->keep_locks = 1,
+	 * without COWing, this is because even with path->keep_locks == true,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
* node when path->slots[node_level - 1] does not point to the last
* item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
@@ -624,7 +615,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto err;
@@ -734,12 +725,12 @@ next:
not_found:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return NULL;
err:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
@@ -756,7 +747,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* full extent lock.
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
/*
@@ -769,7 +760,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* file extent items in the inode's subvolume tree).
*/
if (em && (em->flags & EXTENT_FLAG_MERGED)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
}
@@ -779,10 +770,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* Get the big lock and read metadata off disk. */
if (!locked)
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
- unlock_extent(io_tree, start, end, &cached);
+ btrfs_unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -794,7 +785,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -837,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
ret = true;
out:
- free_extent_map(next);
+ btrfs_free_extent_map(next);
return ret;
}
@@ -857,13 +848,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 lock_start;
+ u64 lock_end;
struct extent_state *cached_state = NULL;
struct folio *folio;
int ret;
again:
+	/* TODO: Add FGP order flags when large folios are fully enabled. */
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio))
@@ -871,13 +863,16 @@ again:
/*
* Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
+ *
+	 * The I/O path for such large folios is not fully tested, so return
+	 * an error to reject them unless this is an experimental build.
+ *
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (folio_test_large(folio)) {
+ if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-ETXTBSY);
@@ -890,14 +885,15 @@ again:
return ERR_PTR(ret);
}
+ lock_start = folio_pos(folio);
+ lock_end = folio_next_pos(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
+ btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
if (!ordered)
break;
@@ -928,7 +924,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EIO);
@@ -951,7 +947,7 @@ struct defrag_target_range {
* @extent_thresh: file extent size threshold, any extent size >= this value
* will be ignored
* @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
+ * @do_compress:	   whether the defrag is applying compression or explicit no-compression
* if true, @extent_thresh will be ignored and all regular
* file extents meeting @newer_than will be targets.
* @locked: if the range has already held extent lock
@@ -1027,8 +1023,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC))
+ if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
@@ -1066,8 +1062,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
/* Empty target list, no way to merge with last entry */
if (list_empty(target_list))
goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
/* Not mergeable with last entry */
if (last->start + last->len != cur)
goto next;
@@ -1077,7 +1073,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
add:
last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
+ range_len = min(btrfs_extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
* last range of the target list.
@@ -1085,8 +1081,8 @@ add:
if (!list_empty(target_list)) {
struct defrag_target_range *last;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
ASSERT(last->start + last->len <= cur);
if (last->start + last->len == cur) {
/* Mergeable, enlarge the last entry */
@@ -1099,7 +1095,7 @@ add:
/* Allocate new defrag_target_range */
new = kmalloc(sizeof(*new), GFP_NOFS);
if (!new) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -ENOMEM;
break;
}
@@ -1108,8 +1104,8 @@ add:
list_add_tail(&new->list, target_list);
next:
- cur = extent_map_end(em);
- free_extent_map(em);
+ cur = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
}
if (ret < 0) {
struct defrag_target_range *entry;
@@ -1162,27 +1158,31 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
struct extent_changeset *data_reserved = NULL;
const u64 start = target->start;
const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = folios[0]->index;
int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
if (ret < 0)
return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, cached_state);
- set_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- folio_clear_checked(folios[i]);
- btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
+ btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
+
+ /*
+ * Update the page status.
+ * Due to possible large folios, we have to check all folios one by one.
+ */
+	for (int i = 0; i < nr_pages && folios[i]; i++) {
+		struct folio *folio = folios[i];
+
+		if (start >= folio_next_pos(folio) ||
+		    start + len <= folio_pos(folio))
+			continue;
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1200,11 +1200,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
LIST_HEAD(target_list);
struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
+ u64 cur = start;
+ const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT) + 1;
int ret = 0;
- int i;
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
@@ -1214,21 +1213,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
return -ENOMEM;
/* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ for (int i = 0; cur < start + len && i < nr_pages; i++) {
+ folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
if (IS_ERR(folios[i])) {
ret = PTR_ERR(folios[i]);
- nr_pages = i;
+ folios[i] = NULL;
goto free_folios;
}
+ cur = folio_next_pos(folios[i]);
}
- for (i = 0; i < nr_pages; i++)
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_wait_writeback(folios[i]);
+ }
+ /* We should get at least one folio. */
+ ASSERT(folios[0]);
/* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1254,11 +1257,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
- for (i = 0; i < nr_pages; i++) {
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_unlock(folios[i]);
folio_put(folios[i]);
}
@@ -1352,17 +1355,19 @@ out:
* (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
* defragging all the range).
*/
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int compress_level = 0;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
@@ -1376,10 +1381,24 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
return -EINVAL;
if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
+ if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress.type) {
+ compress_type = range->compress.type;
+ compress_level = range->compress.level;
+ if (!btrfs_compress_level_valid(compress_type, compress_level))
+ return -EINVAL;
+ }
+ } else {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+ } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+ compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+ compress_level = 1;
}
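A hedged userspace sketch of the per-range compression level interface handled above. It assumes the BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL flag and the compress.type/compress.level fields introduced by this series are visible through an updated <linux/btrfs.h>; treat it as an illustration, not a stable uapi reference:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Defragment the whole file, recompressing with zstd at an explicit level. */
static int defrag_with_level(int fd)
{
	struct btrfs_ioctl_defrag_range_args args;

	memset(&args, 0, sizeof(args));
	args.len = (uint64_t)-1;
	args.flags = BTRFS_DEFRAG_RANGE_COMPRESS | BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL;
	args.compress.type = 3;		/* zstd, matching the kernel's BTRFS_COMPRESS_ZSTD */
	args.compress.level = 3;	/* validated by btrfs_compress_level_valid() */

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
}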
if (extent_thresh == 0)
@@ -1402,8 +1421,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* defrag range can be written sequentially.
*/
start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
+ if (start_index < inode->vfs_inode.i_mapping->writeback_index)
+ inode->vfs_inode.i_mapping->writeback_index = start_index;
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
@@ -1420,27 +1439,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
cluster_end = min(cluster_end, last_byte);
- btrfs_inode_lock(BTRFS_I(inode), 0);
- if (IS_SWAPFILE(inode)) {
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(&inode->vfs_inode)) {
ret = -ETXTBSY;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ if (do_compress || no_compress) {
+ inode->defrag_compress = compress_type;
+ inode->defrag_compress_level = compress_level;
+ }
+ ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
+ newer_than, do_compress || no_compress,
+ &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
cur = max(cluster_end + 1, last_scanned);
@@ -1462,10 +1484,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* need to be written back immediately.
*/
if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
+ filemap_flush(inode->vfs_inode.i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
+ &inode->runtime_flags))
+ filemap_flush(inode->vfs_inode.i_mapping);
}
if (range->compress_type == BTRFS_COMPRESS_LZO)
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
@@ -1473,10 +1495,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
ret = sectors_defragged;
}
- if (do_compress) {
- btrfs_inode_lock(BTRFS_I(inode), 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (do_compress || no_compress) {
+ btrfs_inode_lock(inode, 0);
+ inode->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
}
return ret;
}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 6b7596c4f0dc..a7f917a38dbf 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -6,14 +6,14 @@
#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct file_ra_state;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_ioctl_defrag_range_args;
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 7aa8a395d838..0970799d0aa4 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -111,6 +111,18 @@
* making error handling and cleanup easier.
*/
+static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) {
+ ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id ==
+ BTRFS_SUB_GROUP_DATA_RELOC);
+ return fs_info->data_sinfo->sub_group[0];
+ }
+ return fs_info->data_sinfo;
+}
+
int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
@@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return btrfs_reserve_data_bytes(fs_info, bytes, flush);
+ return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
else if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- ret = btrfs_reserve_data_bytes(fs_info, len, flush);
+ ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0) {
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
extent_changeset_free(*reserved);
*reserved = NULL;
} else {
@@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len)
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len)
{
- struct btrfs_space_info *data_sinfo;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
- data_sinfo = fs_info->data_sinfo;
- btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
+ btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len);
}
/*
@@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
@@ -348,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, meta_reserve,
+ flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
+/* Shrink a previously reserved extent to a new length. */
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len);
+ const u32 new_num_extents = count_max_extents(fs_info, new_len);
+ const int diff_num_extents = new_num_extents - reserved_num_extents;
+
+ ASSERT(new_len <= reserved_len);
+ if (new_num_extents == reserved_num_extents)
+ return;
+
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, diff_num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, true);
+}
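A minimal sketch of the caller pattern the new helper targets; the surrounding function is hypothetical, only btrfs_delalloc_shrink_extents() comes from the patch:

/* A write reserved delalloc space for @reserved_len bytes but will only dirty @new_len. */
static void demo_trim_delalloc_reservation(struct btrfs_inode *inode,
					   u64 reserved_len, u64 new_len)
{
	/*
	 * Drop the outstanding-extent accounting and shrink the inode's block
	 * reserve down to what the shorter range actually needs.
	 */
	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
}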
+
/*
* Reserve data and metadata space for delalloc
*
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 3f32953c0a80..6119c0d3f883 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len);
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
@@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes, bool noflush);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 508bdbae29a0..ce6e9f8812e0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
+ btrfs_delayed_node_ref_tracker_dir_init(delayed_node);
delayed_node->ins_root = RB_ROOT_CACHED;
delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
@@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node(
}
static struct btrfs_delayed_node *btrfs_get_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
@@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
return node;
}
@@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
BUG_ON(btrfs_inode->delayed_node != node);
xa_unlock(&root->delayed_nodes);
return node;
@@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
*/
if (refcount_inc_not_zero(&node->refs)) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker,
+ GFP_ATOMIC);
btrfs_inode->delayed_node = node;
} else {
node = NULL;
@@ -119,9 +126,15 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return NULL;
}
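The btrfs_delayed_node_ref_tracker_*() calls threaded through this file appear to be thin wrappers around the kernel's generic reference tracker. A sketch of that stock API from <linux/ref_tracker.h>, with every name other than the ref_tracker_*() calls being illustrative:

#include <linux/ref_tracker.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t refs;
	struct ref_tracker_dir dir;	/* set up with ref_tracker_dir_init() at allocation time */
};

static void demo_obj_get(struct demo_obj *obj, struct ref_tracker **trackerp)
{
	refcount_inc(&obj->refs);
	/* Records a stack trace for this reference so leaks can be reported. */
	ref_tracker_alloc(&obj->dir, trackerp, GFP_ATOMIC);
}

static void demo_obj_put(struct demo_obj *obj, struct ref_tracker **trackerp)
{
	ref_tracker_free(&obj->dir, trackerp);
	if (refcount_dec_and_test(&obj->refs)) {
		/* Complains about any reference that was never paired with a free. */
		ref_tracker_dir_exit(&obj->dir);
		kfree(obj);
	}
}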
-/* Will return either the node or PTR_ERR(-ENOMEM) */
+/*
+ * Look up an existing delayed node associated with @btrfs_inode or create a new
+ * one and insert it into the delayed nodes of the root.
+ *
+ * Return the delayed node, or an error pointer on failure.
+ */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
struct btrfs_root *root = btrfs_inode->root;
@@ -130,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
void *ptr;
again:
- node = btrfs_get_delayed_node(btrfs_inode);
+ node = btrfs_get_delayed_node(btrfs_inode, tracker);
if (node)
return node;
@@ -139,12 +152,10 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed. */
- refcount_set(&node->refs, 2);
-
/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
if (ret == -ENOMEM) {
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
@@ -153,6 +164,7 @@ again:
if (ptr) {
/* Somebody inserted it, go back and read it. */
xa_unlock(&root->delayed_nodes);
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
@@ -161,6 +173,12 @@ again:
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
+
+ /* Cached in the inode and can be accessed. */
+ refcount_set(&node->refs, 2);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC);
+
btrfs_inode->delayed_node = node;
xa_unlock(&root->delayed_nodes);
@@ -186,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
refcount_inc(&node->refs); /* inserted into list */
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker,
+ GFP_ATOMIC);
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -199,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
+ btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker);
refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
@@ -209,26 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
}
static struct btrfs_delayed_node *btrfs_first_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->node_list))
- goto out;
-
- p = delayed_root->node_list.next;
- node = list_entry(p, struct btrfs_delayed_node, n_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->node_list,
+ struct btrfs_delayed_node, n_list);
+ if (node) {
+ refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static struct btrfs_delayed_node *btrfs_next_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
struct list_head *p;
@@ -248,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
next = list_entry(p, struct btrfs_delayed_node, n_list);
refcount_inc(&next->refs);
+ btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC);
out:
spin_unlock(&delayed_root->lock);
@@ -256,7 +278,7 @@ out:
static void __btrfs_release_delayed_node(
struct btrfs_delayed_node *delayed_node,
- int mod)
+ int mod, struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
@@ -272,6 +294,7 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
+ btrfs_delayed_node_ref_tracker_free(delayed_node, tracker);
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
@@ -281,39 +304,41 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
+ btrfs_delayed_node_ref_tracker_dir_exit(delayed_node);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
-static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 0);
+ __btrfs_release_delayed_node(node, 0, tracker);
}
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->prepare_list))
- goto out;
-
- p = delayed_root->prepare_list.next;
- list_del_init(p);
- node = list_entry(p, struct btrfs_delayed_node, p_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->prepare_list,
+ struct btrfs_delayed_node, p_list);
+ if (node) {
+ list_del_init(&node->p_list);
+ refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static inline void btrfs_release_prepared_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 1);
+ __btrfs_release_delayed_node(node, 1, tracker);
}
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
@@ -336,6 +361,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
return item;
}
+static int delayed_item_index_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *index = key;
+ const struct btrfs_delayed_item *delayed_item = rb_entry(node,
+ struct btrfs_delayed_item, rb_node);
+
+ if (delayed_item->index < *index)
+ return 1;
+ else if (delayed_item->index > *index)
+ return -1;
+
+ return 0;
+}
+
/*
* Look up the delayed item by key.
*
@@ -349,57 +388,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
u64 index)
{
- struct rb_node *node = root->rb_node;
- struct btrfs_delayed_item *delayed_item = NULL;
+ struct rb_node *node;
- while (node) {
- delayed_item = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- if (delayed_item->index < index)
- node = node->rb_right;
- else if (delayed_item->index > index)
- node = node->rb_left;
- else
- return delayed_item;
- }
+ node = rb_find(&index, root, delayed_item_index_cmp);
+ return rb_entry_safe(node, struct btrfs_delayed_item, rb_node);
+}
- return NULL;
+static int btrfs_delayed_item_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_delayed_item *new_item =
+ rb_entry(new, struct btrfs_delayed_item, rb_node);
+
+ return delayed_item_index_cmp(&new_item->index, exist);
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct btrfs_delayed_item *ins)
{
- struct rb_node **p, *node;
- struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
- struct btrfs_delayed_item *item;
- bool leftmost = true;
+ struct rb_node *exist;
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
else
root = &delayed_node->del_root;
- p = &root->rb_root.rb_node;
- node = &ins->rb_node;
-
- while (*p) {
- parent_node = *p;
- item = rb_entry(parent_node, struct btrfs_delayed_item,
- rb_node);
-
- if (item->index < ins->index) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else if (item->index > ins->index) {
- p = &(*p)->rb_left;
- } else {
- return -EEXIST;
- }
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
+ exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp);
+ if (exist)
+ return -EEXIST;
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
ins->index >= delayed_node->index_cnt)
@@ -459,40 +476,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
-
- p = rb_first_cached(&delayed_node->ins_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+ struct rb_node *p = rb_first_cached(&delayed_node->ins_root);
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
-
- p = rb_first_cached(&delayed_node->del_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+ struct rb_node *p = rb_first_cached(&delayed_node->del_root);
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_next_delayed_item(
struct btrfs_delayed_item *item)
{
- struct rb_node *p;
- struct btrfs_delayed_item *next = NULL;
+ struct rb_node *p = rb_next(&item->rb_node);
- p = rb_next(&item->rb_node);
- if (p)
- next = rb_entry(p, struct btrfs_delayed_item, rb_node);
-
- return next;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -666,7 +668,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_key first_key;
const u32 first_data_size = first_item->data_len;
int total_size;
- char *ins_data = NULL;
+ char AUTO_KFREE(ins_data);
int ret;
bool continuous_keys_only = false;
@@ -736,12 +738,10 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(batch.nr * sizeof(u32) +
- batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
- if (!ins_data) {
- ret = -ENOMEM;
- goto out;
- }
+ ins_data = kmalloc_array(batch.nr,
+ sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
batch.keys = ins_keys;
@@ -757,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
- goto out;
+ return ret;
list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
@@ -812,9 +812,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- kfree(ins_data);
- return ret;
+
+ return 0;
}
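The AUTO_KFREE()/BTRFS_PATH_AUTO_FREE() annotations above follow the kernel's scope-based cleanup idiom from <linux/cleanup.h>. A minimal sketch using the stock __free(kfree) attribute, with the demo function being hypothetical:

#include <linux/cleanup.h>
#include <linux/slab.h>

static int demo_scoped_alloc(size_t nr)
{
	/* Freed automatically on every return path once it goes out of scope. */
	u32 *sizes __free(kfree) = kmalloc_array(nr, sizeof(*sizes), GFP_NOFS);

	if (!sizes)
		return -ENOMEM;

	/* ... fill and use sizes ... */
	return 0;
}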
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
@@ -1030,15 +1029,22 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
if (ret > 0)
ret = -ENOENT;
- if (ret < 0)
+ if (ret < 0) {
+ /*
+ * If we fail to update the delayed inode we need to abort the
+		 * transaction, because we could leave the inode behind with
+		 * improper counts.
+ */
+ if (unlikely(ret != -ENOENT))
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
- btrfs_mark_buffer_dirty(trans, leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
@@ -1057,8 +1063,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
goto err_out;
+ }
ASSERT(ret > 0);
ASSERT(path->slots[0] > 0);
ret = 0;
@@ -1080,21 +1088,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* in the same item doesn't exist.
*/
ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_release_delayed_iref(node);
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
-
- /*
- * If we fail to update the delayed inode we need to abort the
- * transaction, because we could leave the inode with the improper
- * counts behind.
- */
- if (ret && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
-
return ret;
}
@@ -1149,6 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
@@ -1166,17 +1168,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
- curr_node = btrfs_first_delayed_node(delayed_root);
+ curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker);
while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
/*
* See the comment below about releasing path before releasing
* node. If the commit of delayed items was successful the path
@@ -1184,7 +1187,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
* point to locked extent buffers (a leaf at the very least).
*/
ASSERT(path->nodes[0] == NULL);
- btrfs_release_delayed_node(prev_node);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
/*
@@ -1197,7 +1200,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
btrfs_free_path(path);
if (curr_node)
- btrfs_release_delayed_node(curr_node);
+ btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1216,8 +1219,10 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
- struct btrfs_path *path;
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node =
+ btrfs_get_delayed_node(inode, &delayed_node_tracker);
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1227,14 +1232,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (!delayed_node->count) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
path = btrfs_alloc_path();
if (!path) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOMEM;
}
@@ -1243,8 +1248,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- btrfs_release_delayed_node(delayed_node);
- btrfs_free_path(path);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1254,18 +1258,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_trans_handle *trans;
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
@@ -1299,7 +1305,7 @@ trans_out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1313,7 +1319,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
return;
inode->delayed_node = NULL;
- btrfs_release_delayed_node(delayed_node);
+
+ btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker);
}
struct btrfs_async_delayed_work {
@@ -1329,6 +1336,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int total_done = 0;
@@ -1345,7 +1353,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
BTRFS_DELAYED_BACKGROUND / 2)
break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root,
+ &delayed_node_tracker);
if (!delayed_node)
break;
@@ -1354,7 +1363,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
continue;
}
@@ -1369,7 +1379,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
btrfs_btree_balance_dirty_nodelay(root->fs_info);
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
@@ -1401,20 +1412,28 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *node;
+
+	node = btrfs_first_delayed_node(fs_info->delayed_root, &delayed_node_tracker);
+	if (WARN_ON(node)) {
+		btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
+ refcount_dec(&node->refs);
+ }
}
-static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
int val = atomic_read(&delayed_root->items_seq);
if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
- return 1;
+ return true;
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
- return 1;
+ return true;
- return 0;
+ return false;
}
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
@@ -1475,13 +1494,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
bool reserve_leaf_space;
u32 data_len;
int ret;
- delayed_node = btrfs_get_or_create_delayed_node(dir);
+ delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1557,13 +1577,12 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_unlock(&delayed_node->mutex);
release_node:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_node *node,
- u64 index)
+static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
+ u64 index)
{
struct btrfs_delayed_item *item;
@@ -1571,7 +1590,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
- return 1;
+ return false;
}
/*
@@ -1606,23 +1625,25 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
}
mutex_unlock(&node->mutex);
- return 0;
+ return true;
}
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_delayed_node *node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *item;
int ret;
- node = btrfs_get_or_create_delayed_node(dir);
+ node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(node))
return PTR_ERR(node);
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
- if (!ret)
+ if (btrfs_delete_delayed_insertion_item(node, index)) {
+ ret = 0;
goto end;
+ }
item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
@@ -1639,7 +1660,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
*/
if (ret < 0) {
btrfs_err(trans->fs_info,
-"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_release_delayed_item(item);
goto end;
}
@@ -1648,22 +1670,23 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
- "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, btrfs_root_id(node->root),
- node->inode_id, ret);
+"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
}
mutex_unlock(&node->mutex);
end:
- btrfs_release_delayed_node(node);
+ btrfs_release_delayed_node(node, &delayed_node_tracker);
return ret;
}
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+	struct btrfs_delayed_node *delayed_node;
+
+	delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
@@ -1673,12 +1696,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
* is updated now. So we needn't lock the delayed node.
*/
if (!delayed_node->index_cnt) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -EINVAL;
}
inode->index_cnt = delayed_node->index_cnt;
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1689,8 +1712,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return false;
@@ -1725,6 +1749,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
+ btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker);
refcount_dec(&delayed_node->refs);
return true;
@@ -1755,17 +1780,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index)
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index)
{
struct btrfs_delayed_item *curr;
- int ret = 0;
+ bool ret = false;
list_for_each_entry(curr, del_list, readdir_list) {
if (curr->index > index)
break;
if (curr->index == index) {
- ret = 1;
+ ret = true;
break;
}
}
@@ -1775,15 +1799,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list,
/*
* Read dir info stored in the delayed tree.
*/
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list)
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
struct btrfs_key location;
char *name;
int name_len;
- int over = 0;
unsigned char d_type;
/*
@@ -1792,6 +1815,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
* directory, nobody can delete any directory indexes now.
*/
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ bool over;
+
list_del(&curr->readdir_list);
if (curr->index < ctx->pos) {
@@ -1809,116 +1834,112 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
- over = !dir_emit(ctx, name, name_len,
- location.objectid, d_type);
+ over = !dir_emit(ctx, name, name_len, location.objectid, d_type);
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
- return 1;
+ return true;
ctx->pos++;
}
- return 0;
+ return false;
}
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
+ struct inode *vfs_inode = &inode->vfs_inode;
u64 flags;
- btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
- btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
- btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
- btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
- btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
- btrfs_set_stack_inode_generation(inode_item,
- BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
+ btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
+ btrfs_set_stack_inode_generation(inode_item, inode->generation);
btrfs_set_stack_inode_sequence(inode_item,
- inode_peek_iversion(inode));
+ inode_peek_iversion(vfs_inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
- btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
- BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev);
+ flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags);
btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode_get_atime_sec(inode));
+ inode_get_atime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode_get_atime_nsec(inode));
+ inode_get_atime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode_get_mtime_sec(inode));
+ inode_get_mtime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode_get_mtime_nsec(inode));
+ inode_get_mtime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime_sec(inode));
+ inode_get_ctime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime_nsec(inode));
+ inode_get_ctime_nsec(vfs_inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec);
}
-int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_inode_item *inode_item;
+ struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOENT;
}
inode_item = &delayed_node->inode_item;
- i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
- i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_mode = btrfs_stack_inode_mode(inode_item);
- set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
- inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
- BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
-
- inode_set_iversion_queried(inode,
- btrfs_stack_inode_sequence(inode_item));
- inode->i_rdev = 0;
+ i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
+ inode->generation = btrfs_stack_inode_generation(inode_item);
+ inode->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item));
+ vfs_inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
- inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime),
btrfs_stack_timespec_nsec(&inode_item->atime));
- inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime),
btrfs_stack_timespec_nsec(&inode_item->mtime));
- inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
+ inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
- inode->i_generation = BTRFS_I(inode)->generation;
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ vfs_inode->i_generation = inode->generation;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1927,16 +1948,16 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item,
- &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
goto release_node;
}
@@ -1944,13 +1965,13 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1958,6 +1979,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
/*
* we don't do delayed inode updates during log recovery because it
@@ -1967,7 +1989,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return -EAGAIN;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1986,15 +2008,12 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
* It is very rare.
*/
mutex_lock(&delayed_node->mutex);
- if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
- goto release_node;
-
- set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
- delayed_node->count++;
- atomic_inc(&fs_info->delayed_root->items);
-release_node:
+ if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ delayed_node->count++;
+ atomic_inc(&fs_info->delayed_root->items);
+ }
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -2038,19 +2057,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return;
__btrfs_kill_delayed_node(delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
}
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ struct btrfs_ref_tracker delayed_node_trackers[8];
while (1) {
struct btrfs_delayed_node *node;
@@ -2069,6 +2090,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
* about to be removed from the tree in the loop below
*/
if (refcount_inc_not_zero(&node->refs)) {
+ btrfs_delayed_node_ref_tracker_alloc(node,
+ &delayed_node_trackers[count],
+ GFP_ATOMIC);
delayed_nodes[count] = node;
count++;
}
@@ -2080,7 +2104,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
- btrfs_release_delayed_node(delayed_nodes[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i],
+ &delayed_node_trackers[i]);
}
}
}
@@ -2088,14 +2114,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
- curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
+ &curr_delayed_node_tracker);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
- btrfs_release_delayed_node(prev_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
}
@@ -2105,8 +2134,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2164,6 +2194,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
@@ -2174,8 +2205,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
struct btrfs_delayed_item *next;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2207,5 +2239,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f4d9feac0d0e..b09d4ec8c77d 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/ref_tracker.h>
#include "ctree.h"
struct btrfs_disk_key;
@@ -44,6 +45,22 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+struct btrfs_ref_tracker_dir {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker_dir dir;
+#else
+ struct {} tracker;
+#endif
+};
+
+struct btrfs_ref_tracker {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker *tracker;
+#else
+ struct {} tracker;
+#endif
+};
+
#define BTRFS_DELAYED_NODE_IN_LIST 0
#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
#define BTRFS_DELAYED_NODE_DEL_IREF 2
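A side note on the #else branches above: both wrapper types collapse to an empty struct when CONFIG_BTRFS_DEBUG is off, so embedding them in struct btrfs_delayed_node adds no memory overhead in release builds. A minimal sketch of that property (demo_tracker is hypothetical; it relies on the GNU C zero-size empty-struct extension the kernel builds with):

#include <linux/build_bug.h>
#include <linux/kconfig.h>
#include <linux/ref_tracker.h>

struct demo_tracker {
#ifdef CONFIG_BTRFS_DEBUG
	struct ref_tracker *tracker;
#else
	struct {} tracker;	/* zero bytes in release builds */
#endif
};

/* With CONFIG_BTRFS_DEBUG disabled the embedded member costs nothing. */
static_assert(IS_ENABLED(CONFIG_BTRFS_DEBUG) || sizeof(struct demo_tracker) == 0);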
@@ -78,6 +95,12 @@ struct btrfs_delayed_node {
* actual number of leaves we end up using. Protected by @mutex.
*/
u32 index_item_leaves;
+ /* Track all references to this delayed node. */
+ struct btrfs_ref_tracker_dir ref_dir;
+ /* Track delayed node reference stored in node list. */
+ struct btrfs_ref_tracker node_list_tracker;
+ /* Track delayed node reference stored in inode cache. */
+ struct btrfs_ref_tracker inode_cache_tracker;
};
struct btrfs_delayed_item {
@@ -133,7 +156,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
-int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
/* Used for drop dead root */
@@ -150,10 +173,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index);
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list);
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index);
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
@@ -170,4 +192,81 @@ void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
+#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16
+#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_init(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT,
+ "delayed_node");
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_exit(&node->ref_dir.dir);
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ /*
+ * Only print if there are leaked references. The caller is
+ * holding one reference, so if refs == 1 there is no leak.
+ */
+ if (refcount_read(&node->refs) == 1)
+ return;
+
+ ref_tracker_dir_print(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker);
+}
+#else
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { }
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ return 0;
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ return 0;
+}
+#endif
+
#endif
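Taken together, the helpers above pair every reference count change on a delayed node with a tracker operation, which is what the delayed-inode.c hunks earlier in this diff do at each get/release site. A hedged sketch of that calling convention (demo_delayed_node_use is hypothetical and the alloc return value is ignored for brevity):

static void demo_delayed_node_use(struct btrfs_delayed_node *node)
{
	struct btrfs_ref_tracker tracker;

	/* Taking a reference: record where it was acquired. */
	refcount_inc(&node->refs);
	btrfs_delayed_node_ref_tracker_alloc(node, &tracker, GFP_NOFS);

	/* ... work with the delayed node ... */

	/* Dropping the reference: release the matching tracker first. */
	btrfs_delayed_node_ref_tracker_free(node, &tracker);
	refcount_dec(&node->refs);
}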
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 0d878dbbabba..e8bc37453336 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
u64 num_bytes;
u64 reserved_bytes;
+ if (btrfs_is_testing(fs_info))
+ return;
+
num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
trans->delayed_ref_csum_deletions);
@@ -225,7 +228,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush);
if (ret)
return ret;
@@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
- btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
+ btrfs_space_info_free_bytes_may_use(space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -265,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
- struct btrfs_delayed_ref_node *ref2)
+static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1,
+ const struct btrfs_delayed_ref_node *ref2)
{
if (ref1->data_ref.objectid < ref2->data_ref.objectid)
return -1;
@@ -279,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
-static int comp_refs(struct btrfs_delayed_ref_node *ref1,
- struct btrfs_delayed_ref_node *ref2,
+static int comp_refs(const struct btrfs_delayed_ref_node *ref1,
+ const struct btrfs_delayed_ref_node *ref2,
bool check_seq)
{
int ret = 0;
@@ -314,35 +317,23 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
+static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist)
+{
+ const struct btrfs_delayed_ref_node *new_node =
+ rb_entry(new, struct btrfs_delayed_ref_node, ref_node);
+ const struct btrfs_delayed_ref_node *exist_node =
+ rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);
+
+ return comp_refs(new_node, exist_node, true);
+}
+
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- bool leftmost = true;
-
- while (*p) {
- int comp;
-
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- ref_node);
- comp = comp_refs(ins, entry, true);
- if (comp < 0) {
- p = &(*p)->rb_left;
- } else if (comp > 0) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- return entry;
- }
- }
+ struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node);
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
- return NULL;
+ return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node);
}
static struct btrfs_delayed_ref_head *find_first_ref_head(
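The hunk above replaces the open-coded rbtree descent with rb_find_add_cached() plus a comparator. A minimal sketch of that pattern on a hypothetical structure (demo_item, demo_cmp and demo_insert are illustrative only, not btrfs code):

#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_item {
	struct rb_node rb;
	u64 key;
};

static int demo_cmp(const struct rb_node *new, const struct rb_node *exist)
{
	const struct demo_item *a = rb_entry(new, struct demo_item, rb);
	const struct demo_item *b = rb_entry(exist, struct demo_item, rb);

	if (a->key < b->key)
		return -1;
	if (a->key > b->key)
		return 1;
	return 0;
}

/* Returns the existing entry on collision, or NULL if 'item' was inserted. */
static struct demo_item *demo_insert(struct rb_root_cached *root,
				     struct demo_item *item)
{
	struct rb_node *exist = rb_find_add_cached(&item->rb, root, demo_cmp);

	return rb_entry_safe(exist, struct demo_item, rb);
}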
@@ -555,6 +546,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
delayed_refs->num_heads_ready--;
}
+struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+
+ lockdep_assert_held(&head->mutex);
+ lockdep_assert_held(&head->lock);
+
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ return NULL;
+
+	/*
+	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * This prevents the ref count from dropping to zero, which would delete
+	 * the extent item from the extent tree while there are still references
+	 * to add; those additions would then fail because they could not find
+	 * the extent item.
+	 */
+ if (!list_empty(&head->ref_add_list))
+ return list_first_entry(&head->ref_add_list,
+ struct btrfs_delayed_ref_node, add_list);
+
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
+ struct btrfs_delayed_ref_node, ref_node);
+ ASSERT(list_empty(&ref->add_list));
+ return ref;
+}
+
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
@@ -781,9 +798,13 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
}
/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
+ * Helper function to actually insert a head node into the xarray. This does all
+ * the dirty work in terms of maintaining the correct overall modification
+ * count.
+ *
+ * The caller is responsible for freeing @qrecord: if this function reports,
+ * via @qrecord_inserted_ret, that it did not insert the record, then it is
+ * safe to call kfree() on it.
*
* Returns an error pointer in case of an error.
*/
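The new comment makes the caller responsible for the qgroup record. A hedged sketch of the resulting caller-side pattern (demo_add_head is hypothetical, error handling is omitted, and the add_delayed_ref_head() signature is assumed from the hunks below):

static void demo_add_head(struct btrfs_trans_handle *trans,
			  struct btrfs_delayed_ref_head *head_ref,
			  struct btrfs_qgroup_extent_record *record)
{
	bool qrecord_inserted = false;

	head_ref = add_delayed_ref_head(trans, head_ref, record,
					BTRFS_ADD_DELAYED_REF, &qrecord_inserted);

	/* Per the contract above: the caller frees the record unless it was inserted. */
	if (!qrecord_inserted)
		kfree(record);
}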
@@ -797,7 +818,14 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
- bool qrecord_inserted = false;
+
+ /*
+ * If 'qrecord_inserted_ret' is provided, then the first thing we need
+ * to do is to initialize it to false just in case we have an exit
+ * before trying to insert the record.
+ */
+ if (qrecord_inserted_ret)
+ *qrecord_inserted_ret = false;
delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
@@ -816,6 +844,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
/* Record qgroup extent info if provided */
if (qrecord) {
+ /*
+ * Setting 'qrecord' but not 'qrecord_inserted_ret' will likely
+		 * result in a memory leak.
+ */
+ ASSERT(qrecord_inserted_ret != NULL);
+
int ret;
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
@@ -823,12 +857,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index);
- /* Caller responsible for freeing qrecord on error. */
if (ret < 0)
return ERR_PTR(ret);
- kfree(qrecord);
- } else {
- qrecord_inserted = true;
+ } else if (qrecord_inserted_ret) {
+ *qrecord_inserted_ret = true;
}
}
@@ -871,14 +903,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
}
- if (qrecord_inserted_ret)
- *qrecord_inserted_ret = qrecord_inserted;
return head_ref;
}
/*
- * Initialize the structure which represents a modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -911,7 +941,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(generic_ref->ref_root))
+ if (btrfs_is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
@@ -935,14 +965,14 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
generic_ref->tree_ref.level = level;
generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -952,15 +982,15 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
u64 mod_root, bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
generic_ref->data_ref.objectid = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -1032,6 +1062,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
xa_release(&delayed_refs->head_refs, index);
spin_unlock(&delayed_refs->lock);
ret = PTR_ERR(new_head_ref);
+
+			/*
+			 * It's only safe to call kfree() on 'qrecord' if it was
+			 * reserved and add_delayed_ref_head() has _not_ inserted
+			 * it for qgroup tracing. Otherwise there is nothing to
+			 * free, or the record is now owned by the qgroup code,
+			 * so skip the free_record path.
+			 */
+ if (!qrecord_reserved || qrecord_inserted)
+ goto free_head_ref;
goto free_record;
}
head_ref = new_head_ref;
@@ -1054,6 +1092,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
+
+ kfree(record);
return 0;
free_record:
@@ -1263,7 +1303,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (pin_bytes) {
+ if (!btrfs_is_testing(fs_info) && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1281,8 +1321,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_lock(&bg->space_info->lock);
spin_lock(&bg->lock);
bg->pinned += head->num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info,
- bg->space_info,
+ btrfs_space_info_update_bytes_pinned(bg->space_info,
head->num_bytes);
bg->reserved -= head->num_bytes;
bg->space_info->bytes_reserved -= head->num_bytes;
@@ -1295,12 +1334,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ if (!btrfs_is_testing(fs_info))
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
- btrfs_qgroup_destroy_extent_records(trans);
+
+ if (!btrfs_is_testing(fs_info))
+ btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
}
@@ -1316,7 +1358,7 @@ int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
- goto fail;
+ return -ENOMEM;
btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
if (!btrfs_delayed_ref_node_cachep)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 611fb3388f82..5ce940532144 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -14,6 +14,8 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+#include "messages.h"
struct btrfs_trans_handle;
struct btrfs_fs_info;
@@ -260,7 +262,6 @@ enum btrfs_ref_type {
BTRFS_REF_NOT_SET,
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
- BTRFS_REF_LAST,
} __packed;
struct btrfs_ref {
@@ -275,10 +276,6 @@ struct btrfs_ref {
*/
bool skip_qgroup;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* Through which root is this modification. */
- u64 real_root;
-#endif
u64 bytenr;
u64 num_bytes;
u64 owning_root;
@@ -295,6 +292,11 @@ struct btrfs_ref {
struct btrfs_data_ref data_ref;
struct btrfs_tree_ref tree_ref;
};
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Through which root is this modification. */
+ u64 real_root;
+#endif
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
@@ -402,6 +404,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
+struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
@@ -418,7 +421,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
-static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -426,7 +429,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
return node->tree_ref.level;
}
-static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -434,7 +437,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
return 0;
}
-static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
+static inline u8 btrfs_ref_type(const struct btrfs_ref *ref)
{
ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ac8e97ed13f7..b6c7da8e1bc8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct extent_buffer *eb;
int slot;
int ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
@@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
return 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -100,13 +98,11 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
@@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
- goto out;
+ return 0;
}
slot = path->slots[0];
eb = path->nodes[0];
@@ -162,7 +158,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
@@ -181,8 +177,7 @@ no_valid_dev_replace_entry_found:
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
- if (!dev_replace->srcdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -190,8 +185,7 @@ no_valid_dev_replace_entry_found:
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
- if (!dev_replace->tgtdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -226,8 +220,6 @@ no_valid_dev_replace_entry_found:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
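This conversion (and the similar ones in the following hunks) relies on the BTRFS_PATH_AUTO_FREE() scope-based cleanup helper, which frees the path automatically on every return and lets the 'out:' labels and manual btrfs_free_path() calls go away. A hedged sketch of the resulting shape (demo_lookup is hypothetical):

static int demo_lookup(struct btrfs_root *root, struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;		/* path is NULL, cleanup is a no-op */

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;		/* path freed automatically */

	/* ... use path->nodes[0] and path->slots[0] ... */
	return 0;			/* path freed automatically */
}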
@@ -256,7 +248,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev_file);
@@ -333,7 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ret;
}
@@ -346,7 +338,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
@@ -365,16 +357,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
key.offset = 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -395,7 +386,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -408,7 +399,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
- goto out;
+ return ret;
}
}
@@ -441,11 +432,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
- btrfs_mark_buffer_dirty(trans, eb);
-
-out:
- btrfs_free_path(path);
-
return ret;
}
@@ -503,8 +489,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
}
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = src_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -612,7 +598,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
@@ -649,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- ASSERT(0);
+ DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state");
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
@@ -659,7 +645,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
@@ -806,17 +792,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
- while (find_first_extent_bit(&srcdev->alloc_state, start,
- &found_start, &found_end,
- CHUNK_ALLOCATED, &cached_state)) {
- ret = set_extent_bit(&tgtdev->alloc_state, found_start,
- found_end, CHUNK_ALLOCATED, NULL);
+ while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED, NULL);
if (ret)
break;
start = found_end + 1;
}
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return ret;
}
@@ -955,7 +941,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
@@ -973,7 +959,7 @@ error:
return scrub_ret;
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
@@ -1121,7 +1107,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1155,7 +1141,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1259,7 +1245,7 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
@@ -1277,16 +1263,16 @@ static int btrfs_dev_replace_kthread(void *data)
return 0;
}
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
- return 0;
+ return false;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- return 0;
+ return false;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
@@ -1301,7 +1287,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
*/
break;
}
- return 1;
+ return true;
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 23e480efe5e6..b35cecf388f2 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1ea5d8fcfbf7..085a83ae9e62 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -9,6 +9,7 @@
#include "transaction.h"
#include "accessors.h"
#include "dir-item.h"
+#include "delayed-inode.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
@@ -92,7 +93,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
return ret;
}
@@ -112,7 +112,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
int ret = 0;
int ret2 = 0;
struct btrfs_root *root = dir->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
unsigned long name_ptr;
@@ -152,7 +152,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
- btrfs_mark_buffer_dirty(trans, leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
@@ -165,7 +164,6 @@ second_insert:
ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
&disk_key, type, index);
out_free:
- btrfs_free_path(path);
if (ret)
return ret;
if (ret2)
@@ -229,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return di;
}
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name)
{
int ret;
@@ -238,13 +236,13 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int data_size;
struct extent_buffer *leaf;
int slot;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- key.objectid = dir;
+ key.objectid = dir_ino;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name->name, name->len);
@@ -253,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
if (ret < 0)
- goto out;
+ return ret;
}
/* we found an item, look for our name in the item */
if (di) {
/* our exact name was found */
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
/* See if there is room in the item to insert this name. */
@@ -275,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
- ret = -EOVERFLOW;
- } else {
- /* plenty of insertion room */
- ret = 0;
+ return -EOVERFLOW;
}
-out:
- btrfs_free_path(path);
- return ret;
+
+ /* Plenty of insertion room. */
+ return 0;
}
/*
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 28d69970bc70..e52174a8baf9 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -10,10 +10,11 @@ struct fscrypt_str;
struct btrfs_fs_info;
struct btrfs_key;
struct btrfs_path;
+struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index a7c3e221378d..07e19e88ba4b 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -10,6 +10,8 @@
#include "fs.h"
#include "transaction.h"
#include "volumes.h"
+#include "bio.h"
+#include "ordered-data.h"
struct btrfs_dio_data {
ssize_t submitted;
@@ -42,21 +44,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
/* Direct lock must be taken before the extent lock. */
if (nowait) {
- if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
+ if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
return -EAGAIN;
} else {
- lock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
}
while (1) {
if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
+ cached_state)) {
ret = -EAGAIN;
break;
}
} else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
}
/*
* We're concerned with the entire range that we're going to be
@@ -78,7 +80,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
if (nowait) {
@@ -131,7 +133,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
if (ret)
- unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret;
}
@@ -151,11 +153,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}
ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT));
+ (1U << type) |
+ (1U << BTRFS_ORDERED_DIRECT));
if (IS_ERR(ordered)) {
if (em) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(inode, start,
start + file_extent->num_bytes - 1, false);
}
@@ -184,7 +186,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
+ 0, alloc_hint, &ins, true, true);
if (ret == -EAGAIN) {
ASSERT(btrfs_is_zoned(fs_info));
wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
@@ -204,8 +206,7 @@ again:
BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
return em;
}
@@ -246,10 +247,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
else
type = BTRFS_ORDERED_NOCOW;
len = min(len, em->len - (start - em->start));
- block_start = extent_map_block_start(em) + (start - em->start);
+ block_start = btrfs_extent_map_block_start(em) + (start - em->start);
- if (can_nocow_extent(inode, start, &len,
- &file_extent, false, false) == 1) {
+ if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
+ false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -265,7 +266,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
btrfs_dec_nocow_writers(bg);
if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
@@ -278,7 +279,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
&file_extent, type);
btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = em2;
em = em2;
}
@@ -291,7 +292,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
dio_data->nocow_done = true;
} else {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
if (nowait) {
@@ -386,7 +387,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to allocate a contiguous array for the checksums.
*/
if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+ len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);
lockstart = start;
lockend = start + len - 1;
@@ -440,8 +441,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
start, data_alloc_len, false);
if (!ret)
dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ else if (!(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
goto err;
}
@@ -474,8 +475,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
- free_extent_map(em);
+ if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ btrfs_free_extent_map(em);
/*
* If we are in a NOWAIT context, return -EAGAIN in order to
* fallback to buffered IO. This is not only because we can
@@ -516,7 +517,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* after we have submitted bios for all the extents in the range.
*/
if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -EAGAIN;
goto unlock_err;
}
@@ -558,13 +559,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
- iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
* Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
@@ -575,13 +576,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write)
unlock_bits |= EXTENT_DIO_LOCKED;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, &cached_state);
/* We didn't use everything, unlock the dio extent for the remainder. */
if (!write && (start + len) < lockend)
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
- lockend, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+ lockend, NULL);
return 0;
@@ -591,8 +592,8 @@ unlock_err:
* to update this, be explicit that we expect EXTENT_LOCKED and
* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -615,8 +616,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
return 0;
}
@@ -627,8 +628,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false);
else
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write) {
@@ -660,8 +661,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes,
!bio->bi_status);
} else {
- unlock_dio_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
+ btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
}
bbio->bio.bi_private = bbio->private;
@@ -692,9 +693,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
* a pre-existing one.
*/
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
+ ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
if (ret)
return ret;
}
@@ -714,10 +715,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
container_of(bbio, struct btrfs_dio_private, bbio);
struct btrfs_dio_data *dio_data = iter->private;
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
dip->file_offset = file_offset;
dip->bytes = bio->bi_iter.bi_size;
@@ -787,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
+	/*
+	 * For bs > ps support we rely heavily on large folios to make sure no
+	 * block crosses a large folio boundary.
+	 *
+	 * But memory provided by direct IO is only virtually contiguous, not
+	 * physically contiguous, which breaks btrfs' large folio requirement.
+	 *
+	 * So with bs > ps, all direct IO must fall back to buffered IO.
+	 */
+ if (fs_info->sectorsize > PAGE_SIZE)
+ return -EINVAL;
+
return 0;
}
@@ -856,6 +867,22 @@ relock:
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
+	/*
+	 * We can't control the folios being passed in; applications can write
+	 * to them while a direct IO write is in progress. This means the
+	 * content might change after we have calculated the data checksum, so
+	 * we can end up storing a checksum that doesn't match the persisted
+	 * data.
+	 *
+	 * To be extra safe and avoid false data checksum mismatches, fall back
+	 * to buffered IO if the inode requires data checksums. For buffered IO
+	 * we have full control of the page cache and can ensure no one modifies
+	 * the content during writeback.
+	 */
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
/*
* The iov_iter can be mapped to the same file range we are writing to.
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
index 3dc3ea926afe..df5d45ee6de7 100644
--- a/fs/btrfs/direct-io.h
+++ b/fs/btrfs/direct-io.h
@@ -5,6 +5,8 @@
#include <linux/types.h>
+struct kiocb;
+
int __init btrfs_init_dio(void);
void __cold btrfs_destroy_dio(void);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e815d165cccc..89fe85778115 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -250,6 +245,20 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to another
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping back to the 'again' label and
+ * getting this block group over and over when
+ * there are no other block groups in the
+ * discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED,
+ "discard_index=%d",
+ block_group->discard_index);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
@@ -260,9 +269,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +503,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
+ return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +568,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
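The hunks above move the block group reference handling so that peek_discard_list() always takes a reference on the block group it returns and btrfs_discard_workfn() always drops it. A reduced skeleton of that ownership, with everything except the get/put pairing elided:

	/*
	 * peek_discard_list()
	 *	spin_lock(&discard_ctl->lock);
	 *	...
	 *	if (block_group) {
	 *		btrfs_get_block_group(block_group);	ref now owned by the workfn
	 *		discard_ctl->block_group = block_group;
	 *	}
	 *	spin_unlock(&discard_ctl->lock);
	 *
	 * btrfs_discard_workfn(), on every exit path
	 *	spin_lock(&discard_ctl->lock);
	 *	btrfs_put_block_group(block_group);		unconditional put
	 *	discard_ctl->block_group = NULL;
	 *	spin_unlock(&discard_ctl->lock);
	 */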
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index dddb0f9101ba..2c5e85394092 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
+#include <linux/types.h>
#include <linux/sizes.h>
struct btrfs_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 814320948645..89149fac804c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "delayed-inode.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -116,7 +117,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
-int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
+int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
{
if (!extent_buffer_uptodate(eb))
return 0;
@@ -182,26 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
+ const u32 step = min(fs_info->nodesize, PAGE_SIZE);
+ const u32 nr_steps = eb->len / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_pages(eb); i++) {
struct folio *folio = eb->folios[i];
- u64 start = max_t(u64, eb->start, folio_pos(folio));
- u64 end = min_t(u64, eb->start + eb->len,
- folio_pos(folio) + eb->folio_size);
- u32 len = end - start;
-
- ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, folio, offset_in_folio(folio, start),
- mirror_num);
- if (ret)
- break;
+
+ /* No large folio support yet. */
+ ASSERT(folio_order(folio) == 0);
+ ASSERT(i < nr_steps);
+
+ /*
+ * For nodesize < page size, there is just one paddr, with some
+ * offset inside the page.
+ *
+ * For nodesize >= page size, there are one or more paddrs, and eb->start
+ * must be aligned to a page boundary.
+ */
+ paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
}
+ ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start,
+ paddrs, step, mirror_num);
return ret;
}
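The reworked btrfs_repair_eb_io_failure() above now hands btrfs_repair_io_failure() an array of physical addresses plus a step size instead of folio/offset pairs. A worked example of the two layouts the comment describes, with illustrative numbers only:

	/*
	 * step = min(nodesize, PAGE_SIZE), nr_steps = eb->len / step
	 *
	 * nodesize 4K, page size 64K:	step = 4K, nr_steps = 1
	 *	paddrs[0] = page_to_phys(page) + offset_in_page(eb->start)
	 *
	 * nodesize 16K, page size 4K:	step = 4K, nr_steps = 4
	 *	paddrs[0..3] = one physical address per backing page;
	 *	eb->start is page aligned, so the in-page offset is zero
	 */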
@@ -225,8 +233,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
while (1) {
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
+ ret = read_extent_buffer_pages(eb, mirror_num, check);
if (!ret)
break;
@@ -257,7 +264,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
/*
* Checksum a dirty tree block before IO.
*/
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
+int btree_csum_one_bio(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -268,9 +275,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
/* Btree blocks are always contiguous on disk. */
if (WARN_ON_ONCE(bbio->file_offset != eb->start))
- return BLK_STS_IOERR;
+ return -EIO;
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
- return BLK_STS_IOERR;
+ return -EIO;
/*
* If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
@@ -279,14 +286,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
*/
if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
memzero_extent_buffer(eb, 0, eb->len);
- return BLK_STS_OK;
+ return 0;
}
if (WARN_ON_ONCE(found_start != eb->start))
- return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
- eb->start, eb->len)))
- return BLK_STS_IOERR;
+ return -EIO;
+ if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
+ return -EIO;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
@@ -314,7 +320,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
- return BLK_STS_OK;
+ return 0;
error:
btrfs_print_tree(eb, 0);
@@ -328,7 +334,7 @@ error:
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
- return errno_to_blk_status(ret);
+ return ret;
}
static bool check_tree_block_fsid(struct extent_buffer *eb)
@@ -372,21 +378,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
found_start = btrfs_header_bytenr(eb);
- if (found_start != eb->start) {
+ if (unlikely(found_start != eb->start)) {
btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
- if (check_tree_block_fsid(eb)) {
+ if (unlikely(check_tree_block_fsid(eb))) {
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
- if (found_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
btrfs_err(fs_info,
"bad tree block level, mirror %u level %d on logical %llu",
eb->read_mirror, btrfs_header_level(eb), eb->start);
@@ -400,19 +406,19 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
-"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
+"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s",
eb->start, eb->read_mirror,
- CSUM_FMT_VALUE(csum_size, header_csum),
- CSUM_FMT_VALUE(csum_size, result),
+ BTRFS_CSUM_FMT_VALUE(csum_size, header_csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb),
ignore_csum ? ", ignored" : "");
- if (!ignore_csum) {
+ if (unlikely(!ignore_csum)) {
ret = -EUCLEAN;
goto out;
}
}
- if (found_level != check->level) {
+ if (unlikely(found_level != check->level)) {
btrfs_err(fs_info,
"level verify failed on logical %llu mirror %u wanted %u found %u",
eb->start, eb->read_mirror, check->level, found_level);
@@ -454,15 +460,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
- /*
- * If this is a leaf block and it is corrupt, set the corrupt bit so
- * that we don't try and read the other copies of this block, just
- * return -EIO.
- */
- if (found_level == 0 && btrfs_check_leaf(eb)) {
- set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ /* If this is a leaf block and it is corrupt, just return -EIO. */
+ if (found_level == 0 && btrfs_check_leaf(eb))
ret = -EIO;
- }
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
@@ -643,25 +643,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
}
-static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
- u64 objectid)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, gfp_t flags)
{
- bool dummy = btrfs_is_testing(fs_info);
+ struct btrfs_root *root;
+
+ root = kzalloc(sizeof(*root), flags);
+ if (!root)
+ return NULL;
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->fs_info = fs_info;
root->root_key.objectid = objectid;
- root->node = NULL;
- root->commit_root = NULL;
- root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- btrfs_set_root_last_trans(root, 0);
- root->free_objectid = 0;
- root->nr_delalloc_inodes = 0;
- root->nr_ordered_extents = 0;
xa_init(&root->inodes);
xa_init(&root->delayed_nodes);
@@ -695,15 +689,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- btrfs_set_root_last_log_commit(root, 0);
- root->anon_dev = 0;
- if (!dummy) {
- extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES);
- extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE);
+ if (!btrfs_is_testing(fs_info)) {
+ btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -714,14 +705,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
-}
-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- u64 objectid, gfp_t flags)
-{
- struct btrfs_root *root = kzalloc(sizeof(*root), flags);
- if (root)
- __setup_root(root, fs_info, objectid);
return root;
}
@@ -894,7 +878,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- if (is_fstree(objectid))
+ if (btrfs_is_fstree(objectid))
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
@@ -1057,7 +1041,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
root->node = NULL;
goto fail;
}
- if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
ret = -EIO;
goto fail;
}
@@ -1066,10 +1050,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!btrfs_is_testing(fs_info) &&
- btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
- btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_root_id(root) != btrfs_header_owner(root->node)) {
+ if (unlikely(!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node))) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
btrfs_root_id(root), root->node->start,
@@ -1089,21 +1073,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key)
{
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
- btrfs_free_path(path);
return root;
}
/*
- * Initialize subvolume root in-memory structure
+ * Initialize subvolume root in-memory structure.
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ *
+ * In case of failure the caller is responsible to call btrfs_free_fs_root()
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
@@ -1113,7 +1098,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(btrfs_root_id(root))) {
+ btrfs_is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1122,12 +1107,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(btrfs_root_id(root)) &&
+ if (btrfs_is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ return ret;
} else {
root->anon_dev = anon_dev;
}
@@ -1137,7 +1122,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto fail;
+ return ret;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1145,9 +1130,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
mutex_unlock(&root->objectid_mutex);
return 0;
-fail:
- /* The caller is responsible to call btrfs_free_fs_root */
- return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -1258,6 +1240,10 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+ if (fs_info->fs_devices)
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_free_compress_wsm(fs_info);
+ percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
@@ -1326,7 +1312,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
* This is namely for free-space-tree and quota tree, which can change
* at runtime and should only be grabbed from fs_info.
*/
- if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
@@ -1567,7 +1553,7 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ delay = secs_to_jiffies(fs_info->commit_interval);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1582,9 +1568,9 @@ static int transaction_kthread(void *arg)
cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay -= secs_to_jiffies(delta - 1);
delay = min(delay,
- msecs_to_jiffies(fs_info->commit_interval * 1000));
+ secs_to_jiffies(fs_info->commit_interval));
goto sleep;
}
transid = cur->transid;
@@ -1782,8 +1768,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
destroy_workqueue(fs_info->endio_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
- if (fs_info->compressed_write_workers)
- destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -1846,6 +1830,8 @@ void btrfs_put_root(struct btrfs_root *root)
if (refcount_dec_and_test(&root->refs)) {
if (WARN_ON(!xa_empty(&root->inodes)))
xa_destroy(&root->inodes);
+ if (WARN_ON(!xa_empty(&root->delayed_nodes)))
+ xa_destroy(&root->delayed_nodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1866,8 +1852,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
int i;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
+ gang[0] = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
@@ -1930,13 +1916,14 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO);
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
+ btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_BTREE_INODE_IO);
+ btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
+ set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags);
fs_info->btree_inode = inode;
return 0;
@@ -1956,7 +1943,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
@@ -1966,7 +1952,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
@@ -1993,8 +1979,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
- fs_info->compressed_write_workers =
- alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2005,12 +1989,11 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
ordered_flags);
fs_info->discard_ctl.discard_workers =
- alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
+ alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
if (!(fs_info->workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->compressed_write_workers &&
fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -2037,14 +2020,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
- /*
- * Check if the checksum implementation is a fast accelerated one.
- * As-is this is a bit of a hack and should be replaced once the csum
- * implementations provide that information themselves.
- */
+ /* Check if the checksum implementation is a fast accelerated one. */
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
- if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ if (crc32_optimizations() & CRC32C_OPTIMIZATION)
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
break;
case BTRFS_CSUM_TYPE_XXHASH:
@@ -2070,7 +2049,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
int level = btrfs_super_log_root_level(disk_super);
- if (fs_devices->rw_devices == 0) {
+ if (unlikely(fs_devices->rw_devices == 0)) {
btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2091,7 +2070,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
btrfs_put_root(log_tree_root);
return ret;
}
- if (!extent_buffer_uptodate(log_tree_root->node)) {
+ if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
@@ -2099,10 +2078,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
+ btrfs_put_root(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- btrfs_put_root(log_tree_root);
return ret;
}
@@ -2167,8 +2146,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -2199,8 +2177,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
static int load_global_roots(struct btrfs_root *tree_root)
{
- struct btrfs_path *path;
- int ret = 0;
+ BTRFS_PATH_AUTO_FREE(path);
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -2209,18 +2187,17 @@ static int load_global_roots(struct btrfs_root *tree_root)
ret = load_global_roots_objectid(tree_root, path,
BTRFS_EXTENT_TREE_OBJECTID, "extent");
if (ret)
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_CSUM_TREE_OBJECTID, "csum");
if (ret)
- goto out;
+ return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_FREE_SPACE_TREE_OBJECTID,
"free space");
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2327,6 +2304,71 @@ out:
return ret;
}
+static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb)
+{
+ unsigned int cur = 0; /* Offset inside the sys chunk array */
+ /*
+ * At sb read time, fs_info is not fully initialized. Thus we have
+ * to use super block sectorsize, which should have been validated.
+ */
+ const u32 sectorsize = btrfs_super_sectorsize(sb);
+ u32 sys_array_size = btrfs_super_sys_array_size(sb);
+
+ if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
+ btrfs_err(fs_info, "system chunk array too big %u > %u",
+ sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ return -EUCLEAN;
+ }
+
+ while (cur < sys_array_size) {
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ u64 type;
+ u16 num_stripes;
+ u32 len;
+ int ret;
+
+ disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
+ len = sizeof(*disk_key);
+
+ if (unlikely(cur + len > sys_array_size))
+ goto short_read;
+ cur += len;
+
+ btrfs_disk_key_to_cpu(&key, disk_key);
+ if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
+ btrfs_err(fs_info,
+ "unexpected item type %u in sys_array at offset %u",
+ key.type, cur);
+ return -EUCLEAN;
+ }
+ chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
+ goto short_read;
+ type = btrfs_stack_chunk_type(chunk);
+ if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
+ btrfs_err(fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur);
+ return -EUCLEAN;
+ }
+ ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset,
+ sectorsize);
+ if (ret < 0)
+ return ret;
+ cur += btrfs_chunk_item_size(num_stripes);
+ }
+ return 0;
+short_read:
+ btrfs_err(fs_info,
+ "super block sys chunk array short read, cur=%u sys_array_size=%u",
+ cur, sys_array_size);
+ return -EUCLEAN;
+}
+
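validate_sys_chunk_array() above walks a packed sequence of (disk key, chunk item) pairs in the super block. The per-entry size it advances by could be expressed as the following sketch; sys_chunk_entry_size() is a hypothetical helper, not part of the diff:

	/* Hypothetical helper: bytes consumed by one sys_chunk_array entry. */
	static u32 sys_chunk_entry_size(const struct btrfs_chunk *chunk)
	{
		return sizeof(struct btrfs_disk_key) +
		       btrfs_chunk_item_size(btrfs_stack_chunk_num_stripes(chunk));
	}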
/*
* Real super block validation
* NOTE: super csum type and incompat features will not be checked here.
@@ -2381,21 +2423,13 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
- /*
- * We only support at most two sectorsizes: 4K and PAGE_SIZE.
- *
- * We can support 16K sectorsize with 64K page size without problem,
- * but such sectorsize/pagesize combination doesn't make much sense.
- * 4K will be our future standard, PAGE_SIZE is supported from the very
- * beginning.
- */
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ if (!btrfs_supported_blocksize(sectorsize)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2495,6 +2529,11 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (ret)
+ return ret;
+
+ ret = validate_sys_chunk_array(fs_info, sb);
+
/*
* Obvious sys_chunk_array corruptions, it must hold at least one key
* and one chunk
@@ -2557,13 +2596,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
- if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
+ if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
goto out;
}
- if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
@@ -2593,7 +2632,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
root->node = NULL;
return ret;
}
- if (!extent_buffer_uptodate(root->node)) {
+ if (unlikely(!extent_buffer_uptodate(root->node))) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -2697,10 +2736,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
+/*
+ * Lockdep gets confused between our buffer_tree, which requires IRQ locking because
+ * we modify marks in IRQ context, and our delayed inode xarray, which doesn't
+ * have these requirements. Use a class key so lockdep doesn't get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+ /* Use the same flags as mapping->i_pages. */
+ xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
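The buffer_xa_class key above separates the buffer_tree's IRQ-safe xa_lock from other xarray locks in lockdep's eyes. The general shape of the pattern, with my_xa_class and init_my_tree as hypothetical names used only for illustration:

	static struct lock_class_key my_xa_class;

	static void init_my_tree(struct xarray *xa)
	{
		/* Same flags as mapping->i_pages, plus a dedicated lockdep class. */
		xa_init_flags(xa, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
		lockdep_set_class(&xa->xa_lock, &my_xa_class);
	}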
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2712,7 +2762,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
@@ -2757,6 +2806,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
@@ -2790,8 +2840,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
- extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -2856,6 +2906,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -3179,13 +3233,13 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
/*
- * Subpage runtime limitation on v1 cache.
+ * Subpage/bs > ps runtime limitation on v1 cache.
*
- * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * V1 space cache still has some hard coded PAGE_SIZE usage, while
* we're already defaulting to v2 cache, no need to bother v1 as it's
* going to be deprecated anyway.
*/
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
btrfs_warn(fs_info,
"v1 space cache is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
@@ -3239,7 +3293,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
if (IS_ERR(disk_super)) {
ret = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3316,12 +3370,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
fs_info->nodesize = nodesize;
+ fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
+ fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
+ fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
+ fs_info->fs_devices->fs_info = fs_info;
+ if (fs_info->sectorsize > PAGE_SIZE)
+ btrfs_warn(fs_info,
+ "support for block size %u with page size %lu is experimental, some features may be missing",
+ fs_info->sectorsize, PAGE_SIZE);
/*
* Handle the space caching options appropriately now that we have the
* super block loaded and validated.
@@ -3343,11 +3404,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE)
- btrfs_warn(fs_info,
- "read-write for sector size %u with page size %lu is experimental",
- sectorsize, PAGE_SIZE);
-
+ ret = btrfs_alloc_compress_wsm(fs_info);
+ if (ret)
+ goto fail_sb_buffer;
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3395,7 +3454,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_dev->bdev) {
+ if (unlikely(!fs_devices->latest_dev->bdev)) {
btrfs_err(fs_info, "failed to read devices");
ret = -EIO;
goto fail_tree_roots;
@@ -3486,6 +3545,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ btrfs_zoned_reserve_data_reloc_bg(fs_info);
btrfs_free_zone_cache(fs_info);
btrfs_check_active_zone_reservation(fs_info);
@@ -3606,7 +3666,6 @@ fail_alloc:
iput(fs_info->btree_inode);
fail:
- btrfs_close_devices(fs_info->fs_devices);
ASSERT(ret < 0);
return ret;
}
@@ -3619,7 +3678,7 @@ static void btrfs_end_super_write(struct bio *bio)
bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
- btrfs_warn_rl_in_rcu(device->fs_info,
+ btrfs_warn_rl(device->fs_info,
"lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
@@ -3639,85 +3698,6 @@ static void btrfs_end_super_write(struct bio *bio)
bio_put(bio);
}
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache)
-{
- struct btrfs_super_block *super;
- struct page *page;
- u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_mapping;
- int ret;
-
- bytenr_orig = btrfs_sb_offset(copy_num);
- ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
- if (ret == -ENOENT)
- return ERR_PTR(-EINVAL);
- else if (ret)
- return ERR_PTR(ret);
-
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
-
- if (drop_cache) {
- /* This should only be called with the primary sb. */
- ASSERT(copy_num == 0);
-
- /*
- * Drop the page of the primary superblock, so later read will
- * always read from the device.
- */
- invalidate_inode_pages2_range(mapping,
- bytenr >> PAGE_SHIFT,
- (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
- }
-
- page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return ERR_CAST(page);
-
- super = page_address(page);
- if (btrfs_super_magic(super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-ENODATA);
- }
-
- if (btrfs_super_bytenr(super) != bytenr_orig) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-EINVAL);
- }
-
- return super;
-}
-
-
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
-{
- struct btrfs_super_block *super, *latest = NULL;
- int i;
- u64 transid = 0;
-
- /* we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
- for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i, false);
- if (IS_ERR(super))
- continue;
-
- if (!latest || btrfs_super_generation(super) > transid) {
- if (latest)
- btrfs_release_disk_super(super);
-
- latest = super;
- transid = btrfs_super_generation(super);
- }
- }
-
- return super;
-}
-
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
* folios we use for writing are locked.
@@ -3757,8 +3737,8 @@ static int write_dev_supers(struct btrfs_device *device,
continue;
} else if (ret < 0) {
btrfs_err(device->fs_info,
- "couldn't get super block location for mirror %d",
- i);
+ "couldn't get super block location for mirror %d error %d",
+ i, ret);
atomic_inc(&device->sb_write_errors);
continue;
}
@@ -3777,12 +3757,11 @@ static int write_dev_supers(struct btrfs_device *device,
GFP_NOFS);
if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
- "couldn't get super block page for bytenr %llu",
- bytenr);
+ "couldn't get super block page for bytenr %llu error %ld",
+ bytenr, PTR_ERR(folio));
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3855,7 +3834,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
@@ -3970,7 +3948,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
* Checks last_flush_error of disks in order to determine the device
* state.
*/
- if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
return -EIO;
return 0;
@@ -3998,7 +3976,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
}
if (min_tolerated == INT_MAX) {
- pr_warn("BTRFS: unknown raid flag: %llu", flags);
+ btrfs_warn(NULL, "unknown raid flag: %llu", flags);
min_tolerated = 0;
}
@@ -4072,7 +4050,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
ret = btrfs_validate_write_super(fs_info, sb);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, -EUCLEAN,
"unexpected superblock corruption detected");
@@ -4083,7 +4061,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
if (ret)
total_errors++;
}
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_err(fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4108,7 +4086,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
total_errors++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
@@ -4175,8 +4153,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
u64 found_end;
found = true;
- while (find_first_extent_bit(&trans->dirty_pages, cur,
- &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end,
+ EXTENT_DIRTY, &cached)) {
dirty_bytes += found_end + 1 - found_start;
cur = found_end + 1;
}
@@ -4253,6 +4232,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4262,6 +4249,40 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
* already the cleaner, but below we run all pending delayed iputs.
*/
btrfs_flush_workqueue(fs_info->fixup_workers);
+ /*
+ * Similar case here, we have to wait for delalloc workers before we
+ * proceed below and stop the cleaner kthread, otherwise we trigger a
+ * use-after-free on the cleaner kthread task_struct when a delalloc
+ * worker running submit_compressed_extents() adds a delayed iput, which
+ * does a wake up on the cleaner kthread, which was already freed below
+ * when we call kthread_stop().
+ */
+ btrfs_flush_workqueue(fs_info->delalloc_workers);
+
+ /*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - end_bbio_compressed_write()
+ * calls btrfs_finish_ordered_extent(), which in turn calls
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before flushing them we must first flush this queue, which
+ * serves the compressed write completions.
+ */
+ flush_workqueue(fs_info->endio_workers);
/*
* After we parked the cleaner kthread, ordered extents may have
@@ -4274,8 +4295,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*
* So wait for all ongoing ordered extents to complete and then run
* delayed iputs. This works because once we reach this point no one
- * can either create new ordered extents nor create delayed iputs
- * through some other means.
+ * can create new ordered extents, but delayed iputs can still be added
+ * by a reclaim worker (see comments further below).
*
* Also note that btrfs_wait_ordered_roots() is not safe here, because
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4286,6 +4307,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->endio_write_workers);
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ /*
+ * Run delayed iputs in case an async reclaim worker is waiting for them
+ * to be run as mentioned above.
+ */
btrfs_run_delayed_iputs(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
@@ -4293,6 +4318,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->em_shrinker_work);
+ /*
+ * Run delayed iputs again because an async reclaim worker may have
+ * added new ones if it was flushing delalloc:
+ *
+ * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
+ * start_delalloc_inodes() -> btrfs_add_delayed_iput()
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
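The comments added to close_ctree() above encode a strict shutdown order. A condensed summary of that sequence (names as in the hunks, nothing new introduced):

	/*
	 * 1. btrfs_error_commit_super() first if the fs is in an error state
	 * 2. flush fixup_workers, delalloc_workers, workers and endio_workers
	 * 3. flush endio_write_workers and endio_freespace_worker, then
	 *    btrfs_run_delayed_iputs()
	 * 4. cancel the async/preempt reclaim work and the em shrinker work
	 * 5. btrfs_run_delayed_iputs() again (reclaim may have added more),
	 *    then set BTRFS_FS_STATE_NO_DELAYED_IPUT
	 * 6. btrfs_discard_cleanup()
	 */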
@@ -4321,9 +4358,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4331,7 +4365,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN("qgroup reserved space leaked");
btrfs_err(fs_info, "qgroup reserved space leaked");
}
@@ -4378,7 +4412,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
btrfs_mapping_tree_free(fs_info);
- btrfs_close_devices(fs_info->fs_devices);
}
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
@@ -4446,10 +4479,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4592,9 +4621,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, NULL)) {
- clear_extent_bits(dirty_pages, start, end, mark);
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL)) {
+ btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
@@ -4627,14 +4656,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
+ if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- clear_extent_dirty(unpin, start, end, &cached_state);
- free_extent_state(cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
+ btrfs_free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -4820,7 +4849,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
@@ -4836,14 +4865,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto error;
- if (ret == 0) {
+ return ret;
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
@@ -4854,10 +4882,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a7051e2570c1..5320da83d0cf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -9,7 +9,8 @@
#include <linux/sizes.h>
#include <linux/compiler_types.h>
#include "ctree.h"
-#include "fs.h"
+#include "bio.h"
+#include "ordered-data.h"
struct block_device;
struct super_block;
@@ -58,9 +59,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key);
@@ -96,9 +94,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
/*
* This function is used to grab the root, and avoid it being freed while we
* access it. But it doesn't ensure that the tree is not dropped.
- *
- * If you want to ensure the whole tree is safe, you should use
- * fs_info->subvol_srcu
*/
static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
@@ -112,12 +107,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
- int atomic);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
const struct btrfs_tree_parent_check *check);
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
+int btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e2b22bea348a..230d9326b685 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -23,7 +23,11 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
int type;
if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
- *max_len = BTRFS_FID_SIZE_CONNECTABLE;
+ if (btrfs_root_id(BTRFS_I(inode)->root) !=
+ btrfs_root_id(BTRFS_I(parent)->root))
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ else
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
return FILEID_INVALID;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
@@ -45,6 +49,8 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
if (parent_root_id != fid->root_objectid) {
+ if (*max_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return FILEID_INVALID;
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
type = FILEID_BTRFS_WITH_PARENT_ROOT;
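With the two export.c hunks above, btrfs_encode_fh() advertises the larger connectable-with-root layout when child and parent belong to different subvolumes, and refuses to write a truncated fid when the buffer is still too small. A compact summary (constants as used in the diff):

	/*
	 * non-connectable		-> BTRFS_FID_SIZE_NON_CONNECTABLE
	 * connectable, same root	-> BTRFS_FID_SIZE_CONNECTABLE
	 * connectable, other root	-> BTRFS_FID_SIZE_CONNECTABLE_ROOT
	 *
	 * If the buffer is too small for the needed layout, FILEID_INVALID is
	 * returned (and the first check raises *max_len to the required size).
	 */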
@@ -75,7 +81,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
- struct inode *inode;
+ struct btrfs_inode *inode;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@@ -89,12 +95,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation != 0 && generation != inode->i_generation) {
- iput(inode);
+ if (generation != 0 && generation != inode->vfs_inode.i_generation) {
+ iput(&inode->vfs_inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ return d_obtain_alias(&inode->vfs_inode);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -145,9 +151,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = d_inode(child);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *dir = BTRFS_I(d_inode(child));
+ struct btrfs_inode *inode;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_root_ref *ref;
@@ -159,13 +166,13 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
- key.objectid = btrfs_ino(BTRFS_I(dir));
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -173,7 +180,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset of -1 found, there would have to exist an
* inode with such number or a root with such id.
@@ -210,7 +217,11 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(key.objectid, root));
+ inode = btrfs_iget(key.objectid, root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_obtain_alias(&inode->vfs_inode);
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -219,11 +230,11 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = d_inode(child);
- struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(child));
+ struct btrfs_inode *dir = BTRFS_I(d_inode(parent));
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
@@ -233,37 +244,34 @@ static int btrfs_get_name(struct dentry *parent, char *name,
int ret;
u64 ino;
- if (!S_ISDIR(dir->i_mode))
+ if (!S_ISDIR(dir->vfs_inode.i_mode))
return -EINVAL;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
+ key.objectid = btrfs_root_id(inode->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
key.objectid = ino;
- key.offset = btrfs_ino(BTRFS_I(dir));
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = btrfs_ino(dir);
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
path->slots[0]--;
- } else {
- btrfs_free_path(path);
+ else
return -ENOENT;
- }
}
leaf = path->nodes[0];
@@ -280,7 +288,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
read_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_free_path(path);
/*
* have to add the null termination to make sure that reconnect_path
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 6d08c100b01d..bb2ca1c9c7b0 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -42,8 +42,9 @@ static inline void btrfs_extent_state_leak_debug_check(void)
struct extent_state *state;
while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state = list_first_entry(&states, struct extent_state, leak_list);
+ btrfs_err(NULL,
+ "state leak: start %llu end %llu state %u in tree %d refs %d",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
@@ -59,13 +60,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- const struct btrfs_inode *inode;
+ const struct btrfs_inode *inode = tree->inode;
u64 isize;
if (tree->owner != IO_TREE_INODE_IO)
return;
- inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -80,25 +80,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
-
-/*
- * The only tree allowed to set the inode is IO_TREE_INODE_IO.
- */
-static bool is_inode_io_tree(const struct extent_io_tree *tree)
-{
- return tree->owner == IO_TREE_INODE_IO;
-}
-
-/* Return the inode if it's valid for the given tree, otherwise NULL. */
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
-{
- if (tree->owner == IO_TREE_INODE_IO)
- return tree->inode;
- return NULL;
-}
-
/* Read-only access to the inode. */
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode;
@@ -106,15 +89,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t
}
/* For read-only access to fs_info. */
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode->root->fs_info;
return tree->fs_info;
}
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner)
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner)
{
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
@@ -129,7 +112,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
* aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
* set on any extent state when calling this function).
*/
-void extent_io_tree_release(struct extent_io_tree *tree)
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree)
{
struct rb_root root;
struct extent_state *state;
@@ -148,7 +131,7 @@ void extent_io_tree_release(struct extent_io_tree *tree)
* (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
+ btrfs_free_extent_state(state);
cond_resched_lock(&tree->lock);
}
/*
@@ -176,7 +159,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
btrfs_leak_debug_add_state(state);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
+ trace_btrfs_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -188,14 +171,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal
return prealloc;
}
-void free_extent_state(struct extent_state *state)
+void btrfs_free_extent_state(struct extent_state *state)
{
if (!state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del_state(state);
- trace_free_extent_state(state, _RET_IP_);
+ trace_btrfs_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -222,38 +205,34 @@ static inline struct extent_state *next_state(struct extent_state *state)
{
struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
static inline struct extent_state *prev_state(struct extent_state *state)
{
struct rb_node *next = rb_prev(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
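
The two helpers above now lean on rb_entry_safe() instead of open-coding the NULL check. From memory (treat the exact definition as an assumption), the rbtree helper looks roughly like this, which is why the bare rb_next()/rb_prev() result can be passed straight through:

#define rb_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
	})
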
/*
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset or, if none exists, for the
+ * first entry that starts and ends after that offset.
*
* @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
+ * @offset: search offset
* @node_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
+ * Return a pointer to the entry that contains @offset byte address.
+ *
+ * If no such entry exists, return the first entry that starts and ends after
+ * @offset if one exists, otherwise NULL.
*
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ * If the returned entry starts at @offset, then @node_ret and @parent_ret
+ * aren't changed.
*/
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
u64 offset,
@@ -282,7 +261,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree
if (parent_ret)
*parent_ret = prev;
- /* Search neighbors until we find the first one past the end */
+ /*
+ * Return either the current entry if it contains offset (it ends after
+ * or at offset) or the first entry that starts and ends after offset if
+ * one exists, or NULL.
+ */
while (entry && offset > entry->end)
entry = next_state(entry);
@@ -346,12 +329,12 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(const struct extent_io_tree *tree,
- const struct extent_state *state,
- const char *opname,
- int err)
+static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err,
"extent io tree error on %s state start %llu end %llu",
opname, state->start, state->end);
}
@@ -362,13 +345,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, prev);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
- free_extent_state(prev);
+ btrfs_free_extent_state(prev);
}
}
@@ -378,13 +360,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, next);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
- free_extent_state(next);
+ btrfs_free_extent_state(next);
}
}
@@ -413,8 +394,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_set_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -459,10 +440,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -472,10 +452,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -527,9 +506,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (is_inode_io_tree(tree))
- btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
- split);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_split_delalloc_extent(tree->inode, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -549,7 +527,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
} else if (prealloc->end > entry->end) {
node = &(*node)->rb_right;
} else {
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return -EEXIST;
}
}
@@ -561,6 +539,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
}
/*
+ * Use this during tree iteration to avoid doing next node searches when it's
+ * not needed (the current record ends at or after the target range's end).
+ */
+static inline struct extent_state *next_search_state(struct extent_state *state, u64 end)
+{
+ if (state->end < end)
+ return next_state(state);
+
+ return NULL;
+}
+
+/*
* Utility function to clear some bits in an extent state struct. It will
* optionally wake up anyone waiting on this state (wake == 1).
*
@@ -569,16 +559,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- u32 bits, int wake,
+ u32 bits, int wake, u64 end,
struct extent_changeset *changeset)
{
struct extent_state *next;
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
- bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_clear_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -586,17 +575,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
if (wake)
wake_up(&state->wq);
if (state->state == 0) {
- next = next_state(state);
+ next = next_search_state(state, end);
if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
} else {
WARN_ON(1);
}
} else {
merge_state(tree, state);
- next = next_state(state);
+ next = next_search_state(state, end);
}
return next;
}
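
A hypothetical iteration over [start, end] shows where next_search_state() saves work: once the current record already reaches the end of the target range, the rb_next() walk is skipped entirely (names as in this file; tree->lock is assumed to be held):

	for (state = tree_search(tree, start); state; state = next_search_state(state, end)) {
		if (state->start > end)
			break;
		/* process state->state for the part overlapping [start, end] */
	}
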
@@ -620,18 +609,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state,
- struct extent_changeset *changeset)
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
u64 last_end;
- int err;
- int clear = 0;
- int wake;
- int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+ int ret = 0;
+ bool clear;
+ bool wake;
+ const bool delete = (bits & EXTENT_CLEAR_ALL_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -644,9 +633,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
- wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
- if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
- clear = 1;
+ wake = (bits & EXTENT_LOCK_BITS);
+ clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
again:
if (!prealloc) {
/*
@@ -676,7 +664,7 @@ again:
goto hit_next;
}
if (clear)
- free_extent_state(cached);
+ btrfs_free_extent_state(cached);
}
/* This search will find the extents that end after our range starts. */
@@ -691,7 +679,7 @@ hit_next:
/* The state doesn't have the wanted bits, go ahead. */
if (!(state->state & bits)) {
- state = next_state(state);
+ state = next_search_state(state, end);
goto next;
}
@@ -714,18 +702,24 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
-
+ ret = split_state(tree, state, prealloc, start);
prealloc = NULL;
- if (err)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end,
+ changeset);
goto next;
}
- goto search_again;
+ if (need_resched())
+ goto search_again;
+ /*
+ * Fallthrough and try atomic extent state allocation if needed.
+ * If it fails we'll jump to 'search_again', retry the allocation
+ * in non-atomic mode and start the search again.
+ */
}
/*
* | ---- desired range ---- |
@@ -736,30 +730,31 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, bits, wake, changeset);
+ clear_state_bit(tree, prealloc, bits, wake, end, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end, changeset);
next:
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start <= end && state && !need_resched())
+ if (state && !need_resched())
goto hit_next;
search_again:
- if (start > end)
- goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
@@ -767,10 +762,9 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
- return 0;
+ return ret;
}
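
Since the function now propagates split/insert failures instead of returning 0 unconditionally, callers can check the result; a minimal hypothetical sketch:

	ret = btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED, &cached, NULL);
	if (ret < 0)
		return ret;
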
@@ -820,7 +814,7 @@ process_node:
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
goto again;
}
start = state->end + 1;
@@ -838,7 +832,7 @@ out:
if (cached_state && *cached_state) {
state = *cached_state;
*cached_state = NULL;
- free_extent_state(state);
+ btrfs_free_extent_state(state);
}
spin_unlock(&tree->lock);
}
@@ -877,7 +871,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
*/
state = tree_search(tree, start);
while (state) {
- if (state->end >= start && (state->state & bits))
+ if (state->state & bits)
return state;
state = next_state(state);
}
@@ -892,9 +886,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
* Return true if we find something, and update @start_ret and @end_ret.
* Return false if we found nothing.
*/
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
bool ret = false;
@@ -914,13 +908,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
* again. If we haven't found any, clear as well since
* it's now useless.
*/
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
if (state)
goto got_it;
goto out;
}
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
}
@@ -952,14 +946,17 @@ out:
* contiguous area for given bits. We will search to the first bit we find, and
* then walk down the tree until we find a non-contiguous area. The area
* returned will be the full contiguous area with the bits set.
+ *
+ * Returns true if we found a range with the given bits set, in which case
+ * @start_ret and @end_ret are updated, or false if no range was found.
*/
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
- int ret = 1;
+ bool ret = false;
- ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+ ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES));
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
@@ -971,7 +968,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
break;
*end_ret = state->end;
}
- ret = 0;
+ ret = true;
}
spin_unlock(&tree->lock);
return ret;
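
With the int-to-bool conversion the calling convention flips (the old code returned 0 on success); a hypothetical caller under that assumption:

	u64 found_start, found_end;

	if (btrfs_find_contiguous_extent_bit(tree, start, &found_start, &found_end, EXTENT_DIRTY)) {
		/* [found_start, found_end] is the full contiguous dirty area */
	}
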
@@ -1046,11 +1043,11 @@ out:
*
* [start, end] is inclusive This takes the tree lock.
*/
-static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u64 *failed_start,
- struct extent_state **failed_state,
- struct extent_state **cached_state,
- struct extent_changeset *changeset)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **failed_state,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1129,12 +1126,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1186,12 +1182,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1204,14 +1199,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
@@ -1221,17 +1210,38 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+ * need to unlock and start the search again. If it's not contiguous,
+ * we will end up back here and try to allocate a prealloc state and insert it.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
/*
@@ -1252,8 +1262,11 @@ hit_next:
if (!prealloc)
goto search_again;
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1272,18 +1285,16 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state)
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
- return __set_extent_bit(tree, start, end, bits, NULL, NULL,
- cached_state, NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL);
}
/*
@@ -1304,9 +1315,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
*
* All allocations are done with GFP_NOFS.
*/
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1374,12 +1385,11 @@ hit_next:
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1406,20 +1416,19 @@ hit_next:
goto out;
}
ret = split_state(tree, state, prealloc, start);
- if (ret)
- extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (ret)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1432,14 +1441,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
ret = -ENOMEM;
@@ -1451,16 +1454,37 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+ * need to unlock and start the search again. If it's not contiguous,
+ * we will end up back here and try to allocate a prealloc state and insert it.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
/*
@@ -1477,12 +1501,15 @@ hit_next:
}
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL);
prealloc = NULL;
goto out;
}
@@ -1497,8 +1524,7 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
@@ -1518,8 +1544,8 @@ out:
* spans (last_range_end, end of device]. In this case it's up to the caller to
* trim @end_ret to the appropriate size.
*/
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct extent_state *prev = NULL, *next = NULL;
@@ -1636,10 +1662,10 @@ out:
* all given bits set. If the returned number of bytes is greater than zero
* then @start is updated with the offset of the first byte with the bits set.
*/
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig,
- struct extent_state **cached_state)
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, bool contig,
+ struct extent_state **cached_state)
{
struct extent_state *state = NULL;
struct extent_state *cached;
@@ -1710,7 +1736,7 @@ search:
}
if (cached_state) {
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = state;
if (state)
refcount_inc(&state->refs);
@@ -1724,16 +1750,16 @@ search:
/*
* Check if the single @bit exists in the given range.
*/
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = false;
ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > end)
break;
@@ -1742,9 +1768,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
break;
}
- /* If state->end is (u64)-1, start will overflow to 0 */
- start = state->end + 1;
- if (start > end || start == 0)
+ if (state->end >= end)
break;
state = next_state(state);
}
@@ -1752,16 +1776,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
return bitset;
}
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+
+ /*
+ * The cached state is currently mandatory and not used to start the
+ * search, only to cache the first state record found in the range.
+ */
+ ASSERT(cached_state != NULL);
+ ASSERT(*cached_state == NULL);
+
+ *bits = 0;
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ if (state && state->start < end) {
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ while (state) {
+ if (state->start > end)
+ break;
+
+ *bits |= state->state;
+
+ if (state->end >= end)
+ break;
+
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+}
+
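
A hypothetical caller of the new helper, assuming the usual btrfs_inode::io_tree and that the caller drops the cached reference when done:

	struct extent_state *cached = NULL;
	u32 range_bits;

	btrfs_get_range_bits(&inode->io_tree, start, end, &range_bits, &cached);
	if (range_bits & EXTENT_DELALLOC) {
		/* at least part of [start, end] is delalloc */
	}
	btrfs_free_extent_state(cached);	/* handles a NULL cached state */
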
/*
* Check if the whole range [@start,@end) contains the single @bit set.
*/
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached)
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = true;
ASSERT(is_power_of_2(bit));
+ ASSERT(start < end);
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1769,30 +1828,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
state = cached;
else
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > start) {
bitset = false;
break;
}
- if (state->start > end)
- break;
-
if ((state->state & bit) == 0) {
bitset = false;
break;
}
- if (state->end == (u64)-1)
+ if (state->end >= end)
break;
- /*
- * Last entry (if state->end is (u64)-1 and overflow happens),
- * or next entry starts after the range.
- */
+ /* Next state must start where this one ends. */
start = state->end + 1;
- if (start > end || start == 0)
- break;
state = next_state(state);
}
@@ -1804,8 +1855,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
}
/* Wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCK_BITS yet, as current changeset will
@@ -1814,11 +1865,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCK_BITS case, same reason as
@@ -1826,20 +1877,20 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset);
}
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached)
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached)
{
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- NULL, cached, NULL);
- if (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL);
+ if (ret == -EEXIST) {
if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1, bits, cached);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached);
return 0;
}
return 1;
@@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits
* Either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- &failed_state, cached_state, NULL);
- while (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
+ while (ret == -EEXIST) {
if (failed_start != start)
- clear_extent_bit(tree, start, failed_start - 1,
- bits, cached_state);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached_state);
wait_extent_bit(tree, failed_start, end, bits, &failed_state);
- err = __set_extent_bit(tree, start, end, bits,
- &failed_start, &failed_state,
- cached_state, NULL);
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
}
- return err;
+ return ret;
+}
+
+/*
+ * Get the extent state that follows the given extent state.
+ * This is meant to be used in a context where we know no other tasks can
+ * concurrently modify the tree.
+ */
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *next;
+
+ spin_lock(&tree->lock);
+ ASSERT(extent_state_in_tree(state));
+ next = next_state(state);
+ if (next)
+ refcount_inc(&next->refs);
+ spin_unlock(&tree->lock);
+
+ return next;
}
-void __cold extent_state_free_cachep(void)
+void __cold btrfs_extent_state_free_cachep(void)
{
btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
}
-int __init extent_state_init_cachep(void)
+int __init btrfs_extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0, 0,
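
A hypothetical single-threaded walk built on the new btrfs_next_extent_state(), respecting its refcount contract (each returned state carries an extra reference the caller must drop):

	struct extent_state *state = cached_state;	/* caller already holds a reference */

	while (state) {
		struct extent_state *next = btrfs_next_extent_state(tree, state);

		/* inspect state->start, state->end, state->state ... */
		btrfs_free_extent_state(state);
		state = next;
	}
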
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 6ffef1cd37c1..6f07b965e8da 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,10 +17,10 @@ struct btrfs_inode;
/* Bits for the extent state */
enum {
ENUM_BIT(EXTENT_DIRTY),
- ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_DIO_LOCKED),
- ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DIRTY_LOG1),
+ ENUM_BIT(EXTENT_DIRTY_LOG2),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
ENUM_BIT(EXTENT_BOUNDARY),
@@ -39,6 +39,11 @@ enum {
*/
ENUM_BIT(EXTENT_DELALLOC_NEW),
/*
+ * Mark that a range is being locked for finishing an ordered extent.
+ * Used together with EXTENT_LOCKED.
+ */
+ ENUM_BIT(EXTENT_FINISHING_ORDERED),
+ /*
* When an ordered extent successfully completes for a region marked as
* a new delalloc range, use this flag when clearing a new delalloc
* range to indicate that the VFS' inode number of bytes should be
@@ -130,117 +135,110 @@ struct extent_state {
#endif
};
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner);
-void extent_io_tree_release(struct extent_io_tree *tree);
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner);
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree);
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
+static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-int __init extent_state_init_cachep(void);
-void __cold extent_state_free_cachep(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig,
- struct extent_state **cached_state);
-
-void free_extent_state(struct extent_state *state);
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached_state);
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached,
- struct extent_changeset *changeset);
-
-static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, cached, NULL);
-}
+int __init btrfs_extent_state_init_cachep(void);
+void __cold btrfs_extent_state_free_cachep(void);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL);
-}
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, u32 bits, bool contig,
+ struct extent_state **cached_state);
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
+void btrfs_free_extent_state(struct extent_state *state);
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state);
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached,
+ struct extent_changeset *changeset);
+
+static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, bits, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL);
}
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state);
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
+static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- cached_state, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED,
+ cached, NULL);
}
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state);
+
+static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, cached);
+ return btrfs_clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, cached);
}
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state);
-
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state);
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state);
+
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state);
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED,
+ cached, NULL);
}
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state);
+
#endif /* BTRFS_EXTENT_IO_TREE_H */
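
Typical usage of the renamed locking wrappers declared above, assuming the usual btrfs_inode::io_tree (a sketch, not taken from a specific caller):

	struct extent_state *cached = NULL;

	btrfs_lock_extent(&inode->io_tree, start, end, &cached);
	/* ... operate on the locked file range ... */
	btrfs_unlock_extent(&inode->io_tree, start, end, &cached);
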
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 412e318e4a22..e4cae34620d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -40,13 +40,14 @@
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
+#include "delayed-inode.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -56,12 +57,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
+static int block_group_bits(const struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
@@ -70,20 +71,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
struct btrfs_root *root = btrfs_extent_root(fs_info, start);
- int ret;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = start;
- key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- btrfs_free_path(path);
- return ret;
+ key.offset = len;
+ return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
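
BTRFS_PATH_AUTO_FREE() presumably builds on the kernel's scoped cleanup (__free()) machinery, so the path is released on every return and the explicit btrfs_free_path() tails go away; a hypothetical helper following the same shape as the function above:

	static int lookup_example(struct btrfs_root *root, u64 objectid)
	{
		BTRFS_PATH_AUTO_FREE(path);
		struct btrfs_key key;

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;

		key.objectid = objectid;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = 0;
		/* path is freed automatically on this and every other return */
		return btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
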
/*
@@ -103,7 +101,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 num_refs;
u64 extent_flags;
@@ -125,16 +123,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
search_again:
key.objectid = bytenr;
- key.offset = offset;
if (metadata)
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = offset;
extent_root = btrfs_extent_root(fs_info, bytenr);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out_free;
+ return ret;
if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
@@ -159,7 +157,7 @@ search_again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -167,10 +165,10 @@ search_again:
if (unlikely(num_refs == 0)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
- "unexpected zero reference count for extent item (%llu %u %llu)",
- key.objectid, key.type, key.offset);
+ "unexpected zero reference count for extent item " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key));
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
extent_flags = btrfs_extent_flags(leaf, ei);
owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
@@ -216,8 +214,7 @@ search_again:
*flags = extent_flags;
if (owning_root)
*owning_root = owner;
-out_free:
- btrfs_free_path(path);
+
return ret;
}
@@ -329,11 +326,11 @@ out_free:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -405,23 +402,23 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
-static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref)
+static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref)
{
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
btrfs_extent_data_ref_objectid(leaf, ref),
btrfs_extent_data_ref_offset(leaf, ref));
}
-static int match_extent_data_ref(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- u64 root_objectid, u64 owner, u64 offset)
+static bool match_extent_data_ref(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
{
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
@@ -501,7 +498,7 @@ fail:
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -570,7 +567,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
- btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
fail:
btrfs_release_path(path);
@@ -602,8 +598,8 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
} else {
btrfs_err(trans->fs_info,
- "unrecognized backref key (%llu %u %llu)",
- key.objectid, key.type, key.offset);
+ "unrecognized backref key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key));
btrfs_abort_transaction(trans, -EUCLEAN);
return -EUCLEAN;
}
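
BTRFS_KEY_FMT / BTRFS_KEY_FMT_VALUE() are only used in this diff, not defined; an assumed shape consistent with the old "(%llu %u %llu)" format strings (the real definitions may differ):

	#define BTRFS_KEY_FMT		"(%llu %u %llu)"
	#define BTRFS_KEY_FMT_VALUE(key)	\
		(key)->objectid, (key)->type, (key)->offset
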
@@ -618,18 +614,17 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
- btrfs_mark_buffer_dirty(trans, leaf);
}
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref)
+static noinline u32 extent_data_ref_count(const struct btrfs_path *path,
+ const struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_extent_data_ref *ref1;
- struct btrfs_shared_data_ref *ref2;
+ const struct btrfs_extent_data_ref *ref1;
+ const struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
int type;
@@ -644,10 +639,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
- ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ ref2 = (const struct btrfs_shared_data_ref *)(iref + 1);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
}
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
@@ -690,7 +685,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -728,7 +723,7 @@ static inline int extent_ref_type(u64 parent, u64 owner)
return type;
}
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key)
{
@@ -794,7 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
- path->search_for_extension = 1;
+ path->search_for_extension = true;
} else
extra_size = -1;
@@ -885,7 +880,7 @@ again:
ptr += btrfs_extent_inline_ref_size(type);
continue;
}
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -960,7 +955,7 @@ again:
if (!path->keep_locks) {
btrfs_release_path(path);
- path->keep_locks = 1;
+ path->keep_locks = true;
goto again;
}
@@ -981,11 +976,11 @@ out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
if (path->keep_locks) {
- path->keep_locks = 0;
+ path->keep_locks = false;
btrfs_unlock_up_safe(path, 1);
}
if (insert)
- path->search_for_extension = 0;
+ path->search_for_extension = false;
return ret;
}
@@ -1050,7 +1045,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_mark_buffer_dirty(trans, leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1195,7 +1189,6 @@ static noinline_for_stack int update_inline_extent_backref(
item_size -= size;
btrfs_truncate_item(trans, path, item_size, 1);
}
- btrfs_mark_buffer_dirty(trans, leaf);
return 0;
}
@@ -1218,7 +1211,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
* We're adding refs to a tree block we already own, this
* should not happen at all.
*/
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) {
btrfs_print_leaf(path->nodes[0]);
btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
@@ -1260,12 +1253,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
{
int j, ret = 0;
u64 bytes_left, end;
- u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
+ u64 aligned_start = ALIGN(start, SECTOR_SIZE);
/* Adjust the range to be aligned to 512B sectors if necessary. */
if (start != aligned_start) {
len -= aligned_start - start;
- len = round_down(len, 1 << SECTOR_SHIFT);
+ len = round_down(len, SECTOR_SIZE);
start = aligned_start;
}
@@ -1488,10 +1481,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
@@ -1512,7 +1505,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
- goto out;
+ return ret;
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -1527,24 +1520,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = insert_tree_block_ref(trans, path, node, bytenr);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = insert_extent_data_ref(trans, path, node, bytenr);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
- if (ret)
- btrfs_abort_transaction(trans, ret);
-out:
- btrfs_free_path(path);
return ret;
}
static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_head *href)
+ const struct btrfs_delayed_ref_head *href)
{
u64 root = href->owning_root;
@@ -1553,7 +1546,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
* where it has already been unset.
*/
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
- !href->is_data || !is_fstree(root))
+ !href->is_data || !btrfs_is_fstree(root))
return;
btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
@@ -1562,7 +1555,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1630,13 +1623,13 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head,
+ const struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
u32 item_size;
@@ -1667,7 +1660,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
@@ -1684,8 +1677,8 @@ again:
metadata = 0;
key.objectid = head->bytenr;
- key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = head->num_bytes;
goto again;
}
} else {
@@ -1693,7 +1686,7 @@ again:
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, head->level);
- goto out;
+ return ret;
}
}
@@ -1706,21 +1699,18 @@ again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1767,7 +1757,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1775,7 +1765,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans)) {
if (insert_reserved) {
- btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
free_head_ref_squota_rsv(trans->fs_info, href);
}
return 0;
@@ -1794,7 +1784,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
if (ret < 0)
btrfs_err(trans->fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
@@ -1803,30 +1793,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static inline struct btrfs_delayed_ref_node *
-select_delayed_ref(struct btrfs_delayed_ref_head *head)
-{
- struct btrfs_delayed_ref_node *ref;
-
- if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
- return NULL;
-
- /*
- * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * This is to prevent a ref count from going down to zero, which deletes
- * the extent item from the extent tree, when there still are references
- * to add, which would fail because they would not find the extent item.
- */
- if (!list_empty(&head->ref_add_list))
- return list_first_entry(&head->ref_add_list,
- struct btrfs_delayed_ref_node, add_list);
-
- ref = rb_entry(rb_first_cached(&head->ref_tree),
- struct btrfs_delayed_ref_node, ref_node);
- ASSERT(list_empty(&ref->add_list));
- return ref;
-}
-
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@@ -1925,7 +1891,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes);
if (head->is_data) {
struct btrfs_root *csum_root;
@@ -1959,7 +1925,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
lockdep_assert_held(&locked_ref->mutex);
lockdep_assert_held(&locked_ref->lock);
- while ((ref = select_delayed_ref(locked_ref))) {
+ while ((ref = btrfs_select_delayed_ref(locked_ref))) {
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
@@ -2043,7 +2009,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (min_bytes == 0) {
- max_count = delayed_refs->num_heads_ready;
+ /*
+ * We may be subject to a harmless race if some task is
+ * concurrently adding or removing a delayed ref, so silence
+ * KCSAN and similar tools.
+ */
+ max_count = data_race(delayed_refs->num_heads_ready);
min_bytes = U64_MAX;
}
@@ -2187,7 +2158,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
ret = __btrfs_run_delayed_refs(trans, min_bytes);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2230,10 +2201,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline int check_delayed_ref(struct btrfs_root *root,
+static noinline int check_delayed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 offset, u64 bytenr)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -2307,7 +2279,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* then we have a cross reference.
*/
if (ref->ref_root != btrfs_root_id(root) ||
- ref_owner != objectid || ref_offset != offset) {
+ ref_owner != btrfs_ino(inode) || ref_offset != offset) {
ret = 1;
break;
}
@@ -2318,11 +2290,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
return ret;
}
-static noinline int check_committed_ref(struct btrfs_root *root,
+/*
+ * Check if there are references for a data extent other than the one belonging
+ * to the given inode and offset.
+ *
+ * @inode: The only inode we expect to find associated with the data extent.
+ * @path: A path to use for searching the extent tree.
+ * @offset: The only offset we expect to find associated with the data extent.
+ * @bytenr: The logical address of the data extent.
+ *
+ * When the extent has no references other than the one we expect to find,
+ * we always return 0 with the path having a locked leaf that contains the
+ * extent's extent item - this is necessary to ensure we don't race with a
+ * task running delayed references, and our caller must have such a path when
+ * calling check_delayed_ref() - it must lock a delayed ref head while holding
+ * the leaf locked. In case the extent item is not found in the extent tree,
+ * we return -ENOENT with the path having the leaf (locked) where the extent
+ * item should be, in order to prevent races with another task running delayed
+ * references, so that we don't miss any reference when calling
+ * check_delayed_ref().
+ *
+ * Note: this may return false positives, because we want to be quick here
+ * as we're called in write paths (when flushing delalloc and in the direct
+ * IO write path). For example we can have an extent with a single reference
+ * but that reference is not inlined, or we may have many references in the
+ * extent tree but also delayed references that cancel all the references
+ * except the one for our inode and offset. Doing such checks would be
+ * expensive and complex due to all the locking needed to avoid races between
+ * the checks and flushing delayed refs, plus non-inline references may be
+ * located on leaves other than the one that contains the extent item in the
+ * extent tree. The important thing here is to not return false negatives and
+ * that the false positives are not very common.
+ *
+ * Returns: 0 if there are no cross references, with the path having a locked
+ * leaf from the extent tree that contains the extent's extent item.
+ *
+ * 1 if there are cross references (false positives can happen).
+ *
+ * < 0 in case of an error. In case of -ENOENT the leaf in the extent
+ * tree where the extent item should be located is read locked and
+ * accessible in the given path.
+ */
+static noinline int check_committed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr,
- bool strict)
+ u64 offset, u64 bytenr)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
@@ -2336,40 +2350,37 @@ static noinline int check_committed_ref(struct btrfs_root *root,
int ret;
key.objectid = bytenr;
- key.offset = (u64)-1;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret == 0) {
+ return ret;
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = -ENOENT;
if (path->slots[0] == 0)
- goto out;
+ return -ENOENT;
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
- goto out;
+ return -ENOENT;
- ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
/* No inline refs; we need to bail before checking for owner ref. */
if (item_size == sizeof(*ei))
- goto out;
+ return 1;
/* Check for an owner ref; skip over it to the real inline refs. */
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2377,56 +2388,69 @@ static noinline int check_committed_ref(struct btrfs_root *root,
if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
}
/* If extent item has more than 1 inline ref then it's shared */
if (item_size != expected_size)
- goto out;
-
- /*
- * If extent created before last snapshot => it's shared unless the
- * snapshot has been deleted. Use the heuristic if strict is false.
- */
- if (!strict &&
- (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item)))
- goto out;
+ return 1;
/* If this extent has SHARED_DATA_REF then it's shared */
- type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
- goto out;
+ return 1;
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
- btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
- goto out;
+ return 1;
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr, bool strict, struct btrfs_path *path)
+int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
+ u64 bytenr, struct btrfs_path *path)
{
int ret;
do {
- ret = check_committed_ref(root, path, objectid,
- offset, bytenr, strict);
+ ret = check_committed_ref(inode, path, offset, bytenr);
if (ret && ret != -ENOENT)
goto out;
- ret = check_delayed_ref(root, path, objectid, offset, bytenr);
- } while (ret == -EAGAIN);
+ /*
+ * The path must have a locked leaf from the extent tree where
+ * the extent item for our extent is located, in case it exists,
+ * or where it should be located in case it doesn't exist yet
+ * because it's new and its delayed ref was not yet flushed.
+ * We need to lock the delayed ref head at check_delayed_ref(),
+ * if one exists, while holding the leaf locked, so that we don't
+ * race with delayed ref flushing, miss references and incorrectly
+ * report that the extent is not shared.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ ASSERT(leaf != NULL);
+ btrfs_assert_tree_read_locked(leaf);
+
+ if (ret != -ENOENT) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.objectid == bytenr);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
+ }
+ }
+
+ ret = check_delayed_ref(inode, path, offset, bytenr);
+ } while (ret == -EAGAIN && !path->nowait);
out:
btrfs_release_path(path);
- if (btrfs_is_data_reloc_root(root))
+ if (btrfs_is_data_reloc_root(inode->root))
WARN_ON(ret > 0);
return ret;
}
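For illustration only (not part of the diff): a minimal sketch of how a write-path caller could use the reworked btrfs_cross_ref_exist(). The wrapper name is hypothetical; btrfs_cross_ref_exist(), btrfs_alloc_path() and btrfs_free_path() are the only real interfaces used.

static int data_extent_may_be_shared(struct btrfs_inode *inode, u64 offset,
				     u64 bytenr)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* 0: only our (inode, offset) references bytenr, 1: may be shared, < 0: error. */
	ret = btrfs_cross_ref_exist(inode, offset, bytenr, path);
	btrfs_free_path(path);
	return ret;
}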
@@ -2434,7 +2458,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ bool full_backref, bool inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 parent;
@@ -2520,15 +2544,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, true);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, false);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
@@ -2568,37 +2592,34 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
}
static int pin_down_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *cache,
- u64 bytenr, u64 num_bytes, int reserved)
+ struct btrfs_block_group *bg,
+ u64 bytenr, u64 num_bytes, bool reserved)
{
- struct btrfs_fs_info *fs_info = cache->fs_info;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
- num_bytes);
- if (reserved) {
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ struct btrfs_space_info *space_info = bg->space_info;
+ const u64 reserved_bytes = (reserved ? num_bytes : 0);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ bg->pinned += num_bytes;
+ bg->reserved -= reserved_bytes;
+ spin_unlock(&bg->lock);
+ space_info->bytes_reserved -= reserved_bytes;
+ btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
+ spin_unlock(&space_info->lock);
+
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
return 0;
}
-int btrfs_pin_extent(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int reserved)
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, true);
btrfs_put_block_group(cache);
return 0;
@@ -2622,7 +2643,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, eb->start, eb->len, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, false);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, eb->start, eb->len);
@@ -2724,26 +2745,23 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
- u64 len;
u64 total_unpinned = 0;
u64 empty_cluster = 0;
- bool readonly;
- int ret = 0;
while (start <= end) {
- readonly = false;
+ u64 len;
+ bool readonly;
+
if (!cache ||
start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- if (cache == NULL) {
+ if (unlikely(cache == NULL)) {
/* Logic error, something removed the block group. */
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
cluster = fetch_cluster_info(fs_info,
@@ -2777,45 +2795,28 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
+ readonly = cache->ro;
cache->pinned -= len;
- btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
+ spin_unlock(&cache->lock);
+
+ btrfs_space_info_update_bytes_pinned(space_info, -len);
space_info->max_extent_size = 0;
- if (cache->ro) {
+
+ if (readonly) {
space_info->bytes_readonly += len;
- readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
- btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
- len);
- readonly = true;
+ btrfs_space_info_update_bytes_zone_unusable(space_info, len);
+ } else if (return_free_space) {
+ btrfs_return_free_space(space_info, len);
}
- spin_unlock(&cache->lock);
- if (!readonly && return_free_space &&
- global_rsv->space_info == space_info) {
- spin_lock(&global_rsv->lock);
- if (!global_rsv->full) {
- u64 to_add = min(len, global_rsv->size -
- global_rsv->reserved);
-
- global_rsv->reserved += to_add;
- btrfs_space_info_update_bytes_may_use(fs_info,
- space_info, to_add);
- if (global_rsv->reserved >= global_rsv->size)
- global_rsv->full = 1;
- len -= to_add;
- }
- spin_unlock(&global_rsv->lock);
- }
- /* Add to any tickets we may have */
- if (!readonly && return_free_space && len)
- btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
if (cache)
btrfs_put_block_group(cache);
-out:
- return ret;
+
+ return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
@@ -2823,34 +2824,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
- struct extent_io_tree *unpin;
+ struct extent_io_tree *unpin = &trans->transaction->pinned_extents;
+ struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ int unpin_error = 0;
int ret;
- unpin = &trans->transaction->pinned_extents;
-
- while (!TRANS_ABORTED(trans)) {
- struct extent_state *cached_state = NULL;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state);
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- break;
- }
+ while (!TRANS_ABORTED(trans) && cached_state) {
+ struct extent_state *next_state;
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end, &cached_state);
+ next_state = btrfs_next_extent_state(unpin, cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
ret = unpin_extent_range(fs_info, start, end, true);
- BUG_ON(ret);
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- free_extent_state(cached_state);
- cond_resched();
+ /*
+ * If we get an error unpinning an extent range, store the first
+ * error to return later after trying to unpin all ranges and do
+ * the sync discards. Our caller will abort the transaction
+ * (which already wrote new superblocks) and on the next mount
+ * the space will be available again, since in this phase it was
+ * pinned only by in-memory structures.
+ */
+ if (ret) {
+ btrfs_err_rl(fs_info,
+"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)",
+ start, end, trans->transid,
+ btrfs_decode_error(ret), ret);
+ if (!unpin_error)
+ unpin_error = ret;
+ }
+
+ btrfs_free_extent_state(cached_state);
+
+ if (need_resched()) {
+ btrfs_free_extent_state(next_state);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ cond_resched();
+ cached_state = NULL;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state);
+ } else {
+ cached_state = next_state;
+ if (cached_state) {
+ start = cached_state->start;
+ end = cached_state->end;
+ }
+ }
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_free_extent_state(cached_state);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
btrfs_discard_calc_delay(&fs_info->discard_ctl);
@@ -2864,16 +2894,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
*/
deleted_bgs = &trans->transaction->deleted_bgs;
list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
- u64 trimmed = 0;
-
ret = -EROFS;
if (!TRANS_ABORTED(trans))
- ret = btrfs_discard_extent(fs_info,
- block_group->start,
- block_group->length,
- &trimmed);
+ ret = btrfs_discard_extent(fs_info, block_group->start,
+ block_group->length, NULL);
+ /*
+ * Not strictly necessary to lock, as the block_group should be
+ * read-only from btrfs_delete_unused_bgs().
+ */
+ ASSERT(block_group->ro);
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -2885,7 +2919,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
}
}
- return 0;
+ return unpin_error;
}
/*
@@ -2947,26 +2981,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
}
ret = btrfs_record_squota_delta(trans->fs_info, delta);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3046,12 +3080,12 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -3080,13 +3114,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- if (!is_data && refs_to_drop != 1) {
+ if (unlikely(!is_data && refs_to_drop != 1)) {
btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
node->bytenr, refs_to_drop);
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
if (is_data)
@@ -3127,19 +3161,18 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (!found_extent) {
- if (iref) {
+ if (unlikely(iref)) {
abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
@@ -3186,9 +3219,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"umm, got %d back from search, was looking for %llu, slot %d",
ret, bytenr, path->slots[0]);
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
extent_slot = path->slots[0];
}
@@ -3197,10 +3230,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
bytenr, node->parent, node->ref_root, owner_objectid,
owner_offset, path->slots[0]);
- goto out;
+ return ret;
} else {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
leaf = path->nodes[0];
@@ -3211,7 +3244,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
@@ -3219,26 +3252,24 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) {
abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
path->slots[0], owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
- if (refs < refs_to_drop) {
+ if (unlikely(refs < refs_to_drop)) {
abort_and_dump(trans, path,
"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
refs_to_drop, refs, bytenr, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
refs -= refs_to_drop;
@@ -3250,23 +3281,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- if (!found_extent) {
+ if (unlikely(!found_extent)) {
abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
- btrfs_mark_buffer_dirty(trans, leaf);
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
}
} else {
@@ -3280,23 +3309,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* In this branch refs == 1 */
if (found_extent) {
- if (is_data && refs_to_drop !=
- extent_data_ref_count(path, iref)) {
+ if (unlikely(is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref))) {
abort_and_dump(trans, path,
"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
extent_data_ref_count(path, iref),
refs_to_drop, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
if (iref) {
- if (path->slots[0] != extent_slot) {
+ if (unlikely(path->slots[0] != extent_slot)) {
abort_and_dump(trans, path,
-"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
- key.objectid, key.type,
- key.offset, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref",
+ BTRFS_KEY_FMT_VALUE(&key),
+ path->slots[0]);
+ return -EUCLEAN;
}
} else {
/*
@@ -3305,12 +3332,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* | extent_slot ||extent_slot + 1|
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
- if (path->slots[0] != extent_slot + 1) {
+ if (unlikely(path->slots[0] != extent_slot + 1)) {
abort_and_dump(trans, path,
"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
path->slots[0] = extent_slot;
num_to_del = 2;
@@ -3329,9 +3355,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
@@ -3339,8 +3365,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3449,7 +3473,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
bg = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
goto out;
}
@@ -3473,7 +3497,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
|| btrfs_is_zoned(fs_info)) {
- pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
goto out;
}
@@ -3481,17 +3505,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(bg, buf->start, buf->len);
- btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_free_reserved_bytes(bg, buf->len, false);
btrfs_put_block_group(bg);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
out:
-
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
return 0;
}
@@ -3509,7 +3527,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
- btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
+ btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
@@ -3560,15 +3578,14 @@ enum btrfs_loop_type {
};
static inline void
-btrfs_lock_block_group(struct btrfs_block_group *cache,
- int delalloc)
+btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
- int delalloc)
+ bool delalloc)
{
btrfs_get_block_group(cache);
if (delalloc)
@@ -3578,7 +3595,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
static struct btrfs_block_group *btrfs_lock_cluster(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
- int delalloc)
+ bool delalloc)
__acquires(&cluster->refill_lock)
{
struct btrfs_block_group *used_bg = NULL;
@@ -3615,14 +3632,28 @@ static struct btrfs_block_group *btrfs_lock_cluster(
}
static inline void
-btrfs_release_block_group(struct btrfs_block_group *cache,
- int delalloc)
+btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc)
{
if (delalloc)
up_read(&cache->data_rwsem);
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
/*
* Helper function for find_free_extent().
*
@@ -3644,7 +3675,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -3990,7 +4022,7 @@ static int do_allocation(struct btrfs_block_group *block_group,
static void release_block_group(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
- int delalloc)
+ bool delalloc)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
@@ -4109,6 +4141,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
bool full_search)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -4163,7 +4196,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return ret;
}
- ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+ ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags,
CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
@@ -4200,21 +4233,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4267,7 +4285,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
}
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info)
{
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
@@ -4285,12 +4304,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
/*
- * No lock is OK here because avail is monotinically
+ * No lock is OK here because avail is monotonically
* decreasing, and this is just a hint.
*/
u64 avail = block_group->zone_capacity - block_group->alloc_offset;
if (block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
avail >= ffe_ctl->num_bytes) {
ffe_ctl->hint_byte = block_group->start;
break;
@@ -4312,7 +4332,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- return prepare_allocation_zoned(fs_info, ffe_ctl);
+ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
default:
BUG();
}
@@ -4380,11 +4400,22 @@ static noinline int find_free_extent(struct btrfs_root *root,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, ffe_ctl);
+ trace_btrfs_find_free_extent(root, ffe_ctl);
space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
+ if (btrfs_is_zoned(fs_info) && space_info) {
+ /* Use dedicated sub-space_info for dedicated block group users. */
+ if (ffe_ctl->for_data_reloc) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ } else if (ffe_ctl->for_treelog) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ }
+ }
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
+ btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d",
+ ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc);
return -ENOSPC;
}
@@ -4406,6 +4437,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4431,7 +4463,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- trace_find_free_extent_search_loop(root, ffe_ctl);
+ trace_btrfs_find_free_extent_search_loop(root, ffe_ctl);
ffe_ctl->have_caching_bg = false;
if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
ffe_ctl->index == 0)
@@ -4483,7 +4515,7 @@ search:
}
have_block_group:
- trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
+ trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group);
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
@@ -4576,7 +4608,8 @@ loop:
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info,
+ full_search);
if (ret > 0)
goto search;
@@ -4645,7 +4678,7 @@ loop:
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc)
+ struct btrfs_key *ins, bool is_data, bool delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct find_free_extent_ctl ffe_ctl = {};
@@ -4690,16 +4723,15 @@ again:
"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
- btrfs_dump_space_info(fs_info, sinfo,
- num_bytes, 1);
+ btrfs_dump_space_info(sinfo, num_bytes, 1);
}
}
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc)
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc)
{
struct btrfs_block_group *cache;
@@ -4711,7 +4743,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
}
btrfs_add_free_space(cache, start, len);
- btrfs_free_reserved_bytes(cache, len, delalloc);
+ btrfs_free_reserved_bytes(cache, len, is_delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
@@ -4731,7 +4763,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, true);
btrfs_put_block_group(cache);
return ret;
}
@@ -4742,7 +4774,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes);
if (ret)
return ret;
@@ -4827,14 +4859,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -4902,7 +4933,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
}
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -4923,7 +4953,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
- if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root))
generic_ref.owning_root = root->relocation_src_root;
btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
@@ -4945,7 +4975,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_squota_delta delta = {
+ const struct btrfs_squota_delta delta = {
.root = root_objectid,
.num_bytes = ins->offset,
.generation = trans->transid,
@@ -4979,7 +5009,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1, root_objectid);
if (ret)
- btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ btrfs_pin_extent(trans, ins->objectid, ins->offset);
ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
@@ -5020,7 +5050,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- if (check_eb_lock_owner(buf)) {
+ if (unlikely(check_eb_lock_owner(buf))) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -5071,17 +5101,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG1, NULL);
else
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_NEW, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG2, NULL);
} else {
buf->log_index = -1;
- set_extent_bit(&trans->transaction->dirty_pages, buf->start,
- buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
}
/* this returns a buffer locked for blocking */
return buf;
@@ -5125,7 +5155,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
- empty_size, hint, &ins, 0, 0);
+ empty_size, hint, &ins, false, false);
if (ret)
goto out_unuse;
@@ -5187,7 +5217,7 @@ out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false);
out_unuse:
btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -5285,7 +5315,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *
* reference to it.
*/
generation = btrfs_node_ptr_generation(eb, slot);
- if (!wc->update_ref || generation <= root->root_key.offset)
+ if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
return false;
/*
@@ -5340,7 +5370,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
goto reada;
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset)
+ generation <= btrfs_root_origin_generation(root))
continue;
/* We don't lock the tree block, it's OK to be racy here */
@@ -5429,17 +5459,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5467,7 +5497,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_inline_ref *iref;
int ret;
bool exists = false;
@@ -5484,7 +5514,6 @@ again:
* If we get 0 then we found our reference, return 1, else
* return the error if it's not -ENOENT;
*/
- btrfs_free_path(path);
return (ret < 0 ) ? ret : 1;
}
@@ -5515,11 +5544,10 @@ again:
goto again;
}
- exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent);
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
return exists ? 1 : 0;
}
@@ -5543,7 +5571,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
- if (btrfs_buffer_uptodate(next, generation, 0))
+ if (btrfs_buffer_uptodate(next, generation, false))
return 0;
check.level = level - 1;
@@ -5572,7 +5600,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
* If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
*
* If we are DROP_REFERENCE this will figure out if we need to drop our current
- * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * reference, skipping it if we dropped it from a previous uncompleted drop, or
* dropping it if we still have a reference to it.
*/
static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -5597,7 +5625,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r
ref.parent = path->nodes[level]->start;
} else {
ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) {
btrfs_err(root->fs_info, "mismatched block owner");
return -EIO;
}
@@ -5683,7 +5711,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset) {
+ generation <= btrfs_root_origin_generation(root)) {
wc->lookup_info = 1;
return 1;
}
@@ -5719,7 +5747,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/*
* We have to walk down into this node, and if we're currently at the
- * DROP_REFERNCE stage and this block is shared then we need to switch
+ * DROP_REFERENCE stage and this block is shared then we need to switch
* to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
*/
if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
@@ -5733,7 +5761,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
level--;
ASSERT(level == btrfs_header_level(next));
- if (level != btrfs_header_level(next)) {
+ if (unlikely(level != btrfs_header_level(next))) {
btrfs_err(root->fs_info, "mismatched level");
ret = -EIO;
goto out_unlock;
@@ -5836,15 +5864,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
ret = btrfs_dec_ref(trans, root, eb, 1);
- else
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ } else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
- if (is_fstree(btrfs_root_id(root))) {
+ if (btrfs_is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -5864,13 +5897,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (btrfs_root_id(root) != btrfs_header_owner(eb))
+ else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb)))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level + 1]))
+ else if (unlikely(btrfs_root_id(root) !=
+ btrfs_header_owner(path->nodes[level + 1])))
goto owner_mismatch;
}
@@ -6005,9 +6038,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*
- * If called with for_reloc == 0, may exit early with -EAGAIN
+ * If called with for_reloc not set, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc)
{
const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6015,7 +6048,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root_item *root_item = &root->root_item;
- struct walk_control *wc;
+ struct walk_control AUTO_KFREE(wc);
struct btrfs_key key;
const u64 rootid = btrfs_root_id(root);
int ret = 0;
@@ -6033,9 +6066,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
- btrfs_free_path(path);
ret = -ENOMEM;
- goto out;
+ goto out_free;
}
/*
@@ -6134,13 +6166,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
while (1) {
ret = walk_down_tree(trans, root, path, wc);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -6167,7 +6199,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6203,7 +6235,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6211,7 +6243,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
} else if (ret > 0) {
@@ -6245,7 +6277,6 @@ out_end_trans:
btrfs_end_transaction_throttle(trans);
out_free:
- kfree(wc);
btrfs_free_path(path);
out:
if (!ret && root_dropped) {
@@ -6287,8 +6318,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct walk_control *wc;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct walk_control AUTO_KFREE(wc);
int level;
int parent_level;
int ret = 0;
@@ -6300,14 +6331,12 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- if (!wc) {
- btrfs_free_path(path);
+ if (!wc)
return -ENOMEM;
- }
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
- atomic_inc(&parent->refs);
+ refcount_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -6329,19 +6358,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
while (1) {
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0)
- break;
+ return ret;
ret = walk_up_tree(trans, root, path, wc, parent_level);
if (ret) {
- if (ret > 0)
- ret = 0;
+ if (ret < 0)
+ return ret;
break;
}
}
- kfree(wc);
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -6402,14 +6429,14 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
if (ret)
break;
- find_first_clear_extent_bit(&device->alloc_state, start,
- &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
+ &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- btrfs_warn_in_rcu(fs_info,
+ DEBUG_WARN();
+ btrfs_warn(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
btrfs_dev_name(device),
@@ -6441,8 +6468,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
ret = btrfs_issue_discard(device->bdev, start, len,
&bytes);
if (!ret)
- set_extent_bit(&device->alloc_state, start,
- start + bytes - 1, CHUNK_TRIMMED, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, start,
+ start + bytes - 1, CHUNK_TRIMMED, NULL);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 2ad51130c037..71bb8109c969 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -4,7 +4,6 @@
#define BTRFS_EXTENT_TREE_H
#include <linux/types.h>
-#include "misc.h"
#include "block-group.h"
#include "locking.h"
@@ -31,7 +30,6 @@ struct find_free_extent_ctl {
u64 min_alloc_size;
u64 empty_size;
u64 flags;
- int delalloc;
/* Where to start the search inside the bg */
u64 search_start;
@@ -41,6 +39,7 @@ struct find_free_extent_ctl {
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
+ bool delalloc;
bool have_caching_bg;
bool orig_have_caching_bg;
@@ -50,6 +49,16 @@ struct find_free_extent_ctl {
/* Allocation is called for data relocation */
bool for_data_reloc;
+ /*
+ * Set to true if we're retrying the allocation on this block group
+ * after waiting for caching progress, this is so that we retry only
+ * once before moving on to another block group.
+ */
+ bool retry_uncached;
+
+ /* Whether or not the allocator is currently following a hint. */
+ bool hinted;
+
/* RAID index, converted from flags */
int index;
@@ -58,13 +67,6 @@ struct find_free_extent_ctl {
*/
int loop;
- /*
- * Set to true if we're retrying the allocation on this block group
- * after waiting for caching progress, this is so that we retry only
- * once before moving on to another block group.
- */
- bool retry_uncached;
-
/* If current block group is cached */
int cached;
@@ -83,9 +85,6 @@ struct find_free_extent_ctl {
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
- /* Whether or not the allocator is currently following a hint */
- bool hinted;
-
/* Size class of block groups to prefer in early loops */
enum btrfs_block_group_size_class size_class;
};
@@ -98,7 +97,7 @@ enum btrfs_inline_ref_type {
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
@@ -111,13 +110,11 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags,
u64 *owner_root);
-int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
- int reserved);
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
-int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict,
+int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -140,28 +137,31 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc);
+ struct btrfs_key *ins, bool is_data, bool delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf, int slot);
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc);
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
#endif
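For illustration only (not part of the diff): a minimal sketch of the bool-ified reserve/release pair declared above. The function name and the choice of min_alloc_size are assumptions made for the example.

static int reserve_and_release_demo(struct btrfs_root *root, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins = { 0 };
	int ret;

	ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
				   fs_info->sectorsize, 0, 0, &ins,
				   true /* is_data */, false /* delalloc */);
	if (ret)
		return ret;

	/* Not coming from the delalloc path, hence is_delalloc == false. */
	return btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false);
}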
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b923d0cec61c..629fd5af4286 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
- pr_err(
- "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_err(fs_info,
+ "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
+ eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
@@ -96,9 +96,31 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
*/
struct btrfs_bio_ctrl {
struct btrfs_bio *bbio;
+ /* Last byte contained in bbio + 1. */
+ loff_t next_file_offset;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
+ /*
+ * For data read bios, we attempt to optimize csum lookups if the extent
+ * generation is older than the current one. To make this possible, we
+ * need to track the maximum generation of an extent in a bio_ctrl to
+ * make the decision when submitting the bio.
+ *
+ * The pattern between do_readpage(), submit_one_bio() and
+ * submit_extent_folio() is quite subtle, so tracking this is tricky.
+ *
+ * As we process extent E, we might submit a bio with existing built up
+ * extents before adding E to a new bio, or we might just add E to the
+ * bio. As a result, E's generation could apply to the current bio or
+ * to the next one, so we need to be careful to update the bio_ctrl's
+ * generation with E's only when we are sure E is added to bio_ctrl->bbio
+ * in submit_extent_folio().
+ *
+ * See the comment in btrfs_lookup_bio_sums() for more detail on the
+ * need for this optimization.
+ */
+ u64 generation;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
@@ -108,8 +130,47 @@ struct btrfs_bio_ctrl {
* This is to avoid touching ranges covered by compression/inline.
*/
unsigned long submit_bitmap;
+ struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+	 * the condition to check if the read can be merged with the previous
+	 * bio, which is not correct, e.g. for two file extents pointing to the
+	 * same extent but with different offsets.
+	 *
+	 * So here we need to do extra checks to only merge reads that are
+	 * covered by the same extent map.
+	 * Just extent_map::start is enough, as it is unique inside the
+	 * same inode.
+ */
+ u64 last_em_start;
};
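
To illustrate how last_em_start is meant to be used, here is a minimal standalone C sketch of the force-submit decision (not part of the patch; should_force_submit() and LAST_EM_START_UNSET are illustrative names only): a compressed range is merged into the current bio only when it comes from the same extent map as the previously added range.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LAST_EM_START_UNSET UINT64_MAX

/*
 * Illustrative only, not kernel code: force submitting the built-up bio
 * when a compressed range comes from a different extent map than the
 * previously added range.
 */
static bool should_force_submit(bool compressed, uint64_t last_em_start,
				uint64_t em_start)
{
	return compressed &&
	       last_em_start != LAST_EM_START_UNSET &&
	       last_em_start != em_start;
}

int main(void)
{
	printf("%d\n", should_force_submit(true, LAST_EM_START_UNSET, 4096)); /* 0: first range */
	printf("%d\n", should_force_submit(true, 4096, 4096));                /* 0: same extent map */
	printf("%d\n", should_force_submit(true, 4096, 8192));                /* 1: different map */
	return 0;
}
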
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+ ASSERT(bbio);
+
+ if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+ return;
+
+ bio_ctrl->bbio->csum_search_commit_root =
+ (bio_ctrl->generation &&
+ bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
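
A minimal standalone sketch of the decision made by bio_set_csum_search_commit_root(), under the simplifying assumption that only the tracked maximum extent generation and the current fs generation matter (struct and function names below are illustrative, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only, not kernel code: a data read bio may search csums in
 * the commit root only when every extent added to it has a generation
 * strictly older than the current filesystem generation.
 */
struct bio_ctrl_model {
	uint64_t generation;	/* max extent generation added so far, 0 = none */
};

static bool csum_search_commit_root(const struct bio_ctrl_model *ctrl,
				    uint64_t fs_generation)
{
	return ctrl->generation && ctrl->generation < fs_generation;
}

int main(void)
{
	struct bio_ctrl_model ctrl = { .generation = 41 };

	printf("%d\n", csum_search_commit_root(&ctrl, 42)); /* 1: all extents are older */
	ctrl.generation = 42;
	printf("%d\n", csum_search_commit_root(&ctrl, 42)); /* 0: written in this transaction */
	return 0;
}
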
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
@@ -120,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
+ bio_set_csum_search_commit_root(bio_ctrl);
+
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
@@ -128,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
+ /*
+ * We used the generation to decide whether to lookup csums in the
+ * commit_root or not when we called bio_set_csum_search_commit_root()
+ * above. Now, reset the generation for the next bio.
+ */
+ bio_ctrl->generation = 0;
}
/*
@@ -198,9 +267,8 @@ static void __process_folios_contig(struct address_space *mapping,
u64 end, unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
struct folio_batch fbatch;
int i;
@@ -221,36 +289,27 @@ static void __process_folios_contig(struct address_space *mapping,
}
}
-static noinline void __unlock_for_delalloc(const struct inode *inode,
- const struct folio *locked_folio,
+static noinline void unlock_delalloc_folio(const struct inode *inode,
+ struct folio *locked_folio,
u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
-
ASSERT(locked_folio);
- if (index == locked_folio->index && end_index == index)
- return;
__process_folios_contig(inode->i_mapping, locked_folio, start, end,
PAGE_UNLOCK);
}
static noinline int lock_delalloc_folios(struct inode *inode,
- const struct folio *locked_folio,
+ struct folio *locked_folio,
u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
u64 processed_end = start;
struct folio_batch fbatch;
- if (index == locked_folio->index && index == end_index)
- return 0;
-
folio_batch_init(&fbatch);
while (index <= end_index) {
unsigned int found_folios, i;
@@ -274,8 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
- end + 1) - range_start;
+ range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -288,8 +346,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
out:
folio_batch_release(&fbatch);
if (processed_end > start)
- __unlock_for_delalloc(inode, locked_folio, start,
- processed_end);
+ unlock_delalloc_folio(inode, locked_folio, start, processed_end);
return -EAGAIN;
}
@@ -317,8 +374,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
- /* The sanity tests may not set a valid fs_info. */
- u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
+ u64 max_bytes = fs_info->max_extent_size;
u64 delalloc_start;
u64 delalloc_end;
bool found;
@@ -330,12 +386,19 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
+
+ /*
+ * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+ * return early without handling any dirty ranges.
+ */
+ ASSERT(max_bytes >= fs_info->sectorsize);
+
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -343,7 +406,7 @@ again:
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return false;
}
@@ -361,18 +424,19 @@ again:
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the folioss after the folios that has start */
+ /* step two, lock all the folios after the folios that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the folios are gone, lets avoid looping by
- * shortening the size of the delalloc range we're searching
+ /*
+ * Some of the folios are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching.
*/
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_SIZE;
+ max_bytes = fs_info->sectorsize;
loops = 1;
goto again;
} else {
@@ -382,15 +446,15 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
- ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, cached_state);
+ ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, cached_state);
- unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- __unlock_for_delalloc(inode, locked_folio, delalloc_start,
+ unlock_delalloc_folio(inode, locked_folio, delalloc_start,
delalloc_end);
cond_resched();
goto again;
@@ -406,7 +470,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
end, page_ops);
@@ -428,14 +492,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_next_pos(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio);
else
btrfs_folio_end_lock(fs_info, folio, start, len);
@@ -453,7 +517,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
*/
static void end_bbio_data_write(struct btrfs_bio *bbio)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
@@ -465,9 +529,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
u64 start = folio_pos(folio) + fi.offset;
u32 len = fi.length;
- /* Only order 0 (single page) folios are allowed for data. */
- ASSERT(folio_order(folio) == 0);
-
/* Our read/write should always be sector aligned. */
if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
@@ -491,11 +552,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio));
- btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}
/*
@@ -512,50 +573,26 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
*/
static void end_bbio_data_read(struct btrfs_bio *bbio)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct bio *bio = &bbio->bio;
struct folio_iter fi;
- const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
struct folio *folio = fi.folio;
struct inode *inode = folio->mapping->host;
- u64 start;
- u64 end;
- u32 len;
+ u64 start = folio_pos(folio) + fi.offset;
- /* For now only order 0 folios are supported for data. */
- ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- /*
- * We always issue full-sector reads, but if some block in a
- * folio fails to read, blk_update_request() will advance
- * bv_offset and adjust bv_len to compensate. Print a warning
- * for unaligned offsets, and an error if they don't add up to
- * a full sector.
- */
- if (!IS_ALIGNED(fi.offset, sectorsize))
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %zu and length %zu",
- fi.offset, fi.length);
- else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
- btrfs_info(fs_info,
- "incomplete page read with offset %zu and length %zu",
- fi.offset, fi.length);
-
- start = folio_pos(folio) + fi.offset;
- end = start + fi.length - 1;
- len = fi.length;
if (likely(uptodate)) {
+ u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
@@ -564,9 +601,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (folio_index(folio) == end_index && i_size <= end) {
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
@@ -577,7 +616,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/* Update page status and unlock. */
- end_folio_read(folio, uptodate, start, len);
+ end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
@@ -586,6 +625,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
+ * @order: the order of the folios to be allocated
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
*
@@ -593,12 +633,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* -ENOMEM otherwise, the partially allocated folios would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, order);
if (!folio_array[i])
goto error;
}
@@ -607,6 +648,7 @@ error:
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
folio_put(folio_array[i]);
+ folio_array[i] = NULL;
}
return -ENOMEM;
}
@@ -632,7 +674,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
+ allocated = alloc_pages_bulk(gfp, nr_pages, page_array);
if (unlikely(allocated == last)) {
/* No progress, fail and do cleanup. */
for (int i = 0; i < allocated; i++) {
@@ -668,13 +710,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
}
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
- struct folio *folio, u64 disk_bytenr,
- unsigned int pg_offset)
+ u64 disk_bytenr, loff_t file_offset)
{
struct bio *bio = &bio_ctrl->bbio->bio;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- struct folio *bv_folio = page_folio(bvec->bv_page);
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
@@ -685,19 +724,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
}
/*
- * The contig check requires the following conditions to be met:
- *
- * 1) The folios are belonging to the same inode
- * This is implied by the call chain.
- *
- * 2) The range has adjacent logical bytenr
- *
- * 3) The range has adjacent file offset
- * This is required for the usage of btrfs_bio->file_offset.
+ * To merge into a bio both the disk sector and the logical offset in
+ * the file need to be contiguous.
*/
- return bio_end_sector(bio) == sector &&
- folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len ==
- folio_pos(folio) + pg_offset;
+ return bio_ctrl->next_file_offset == file_offset &&
+ bio_end_sector(bio) == sector;
}
static void alloc_new_bio(struct btrfs_inode *inode,
@@ -707,13 +738,13 @@ static void alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio;
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
- bio_ctrl->end_io_func, NULL);
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
+ file_offset, bio_ctrl->end_io_func, NULL);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bbio->inode = inode;
- bbio->file_offset = file_offset;
+ bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
bio_ctrl->bbio = bbio;
bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->next_file_offset = file_offset;
/* Limit data write bios to the ordered boundary. */
if (bio_ctrl->wbc) {
@@ -744,33 +775,35 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
+ * @read_em_generation: generation of the extent_map we are submitting
+ * (only used for read)
*
* The will either add the page into the existing @bio_ctrl->bbio, or allocate a
* new one in @bio_ctrl->bbio.
- * The mirror number for this IO should already be initizlied in
+ * The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset)
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
+ loff_t file_offset = folio_pos(folio) + pg_offset;
- ASSERT(pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
- !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset))
+ !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
submit_one_bio(bio_ctrl);
do {
u32 len = size;
/* Allocate new bio if needed */
- if (!bio_ctrl->bbio) {
- alloc_new_bio(inode, bio_ctrl, disk_bytenr,
- folio_pos(folio) + pg_offset);
- }
+ if (!bio_ctrl->bbio)
+ alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
@@ -784,14 +817,20 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
continue;
}
+ /*
+ * Now that the folio is definitely added to the bio, include its
+ * generation in the max generation calculation.
+ */
+ bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
+ bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
- len);
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);
size -= len;
pg_offset += len;
disk_bytenr += len;
+ file_offset += len;
/*
* len_to_oe_boundary defaults to U32_MAX, which isn't folio or
@@ -825,7 +864,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
static int attach_extent_buffer_folio(struct extent_buffer *eb,
struct folio *folio,
- struct btrfs_subpage *prealloc)
+ struct btrfs_folio_state *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -839,7 +878,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
if (folio->mapping)
lockdep_assert_held(&folio->mapping->i_private_lock);
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
if (!folio_test_private(folio))
folio_attach_private(folio, eb);
else
@@ -849,7 +888,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
/* Already mapped, just free prealloc */
if (folio_test_private(folio)) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
return 0;
}
@@ -858,15 +897,10 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
-int set_page_extent_mapped(struct page *page)
-{
- return set_folio_extent_mapped(page_folio(page));
-}
-
int set_folio_extent_mapped(struct folio *folio)
{
struct btrfs_fs_info *fs_info;
@@ -878,8 +912,8 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
@@ -895,44 +929,60 @@ void clear_folio_extent_mapped(struct folio *folio)
return;
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_detach_subpage(fs_info, folio);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
-static struct extent_map *__get_extent_map(struct inode *inode,
- struct folio *folio, u64 start,
- u64 len, struct extent_map **em_cached)
+static struct extent_map *get_extent_map(struct btrfs_inode *inode,
+ struct folio *folio, u64 start,
+ u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- struct extent_state *cached_state = NULL;
ASSERT(em_cached);
if (*em_cached) {
em = *em_cached;
- if (extent_map_in_tree(em) && start >= em->start &&
- start < extent_map_end(em)) {
+ if (btrfs_extent_map_in_tree(em) && start >= em->start &&
+ start < btrfs_extent_map_end(em)) {
refcount_inc(&em->refs);
return em;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_cached = NULL;
}
- btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state);
- em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);
+ em = btrfs_get_extent(inode, folio, start, len);
if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state);
return em;
}
+
+static void btrfs_readahead_expand(struct readahead_control *ractl,
+ const struct extent_map *em)
+{
+ const u64 ra_pos = readahead_pos(ractl);
+ const u64 ra_end = ra_pos + readahead_length(ractl);
+ const u64 em_end = em->start + em->len;
+
+ /* No expansion for holes and inline extents. */
+ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
+ return;
+
+ ASSERT(em_end >= ra_pos,
+ "extent_map %llu %llu ends before current readahead position %llu",
+ em->start, em->len, ra_pos);
+ if (em_end > ra_end)
+ readahead_expand(ractl, ra_pos, em_end - ra_pos);
+}
+
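
The expansion in btrfs_readahead_expand() boils down to growing the readahead window so it ends at the extent end. A small standalone sketch of that arithmetic (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only, not kernel code: the readahead window starting at
 * ra_pos is grown so it ends at the extent end when the extent extends
 * past the current window.
 */
static uint64_t expanded_ra_len(uint64_t ra_pos, uint64_t ra_len,
				uint64_t em_start, uint64_t em_len)
{
	uint64_t ra_end = ra_pos + ra_len;
	uint64_t em_end = em_start + em_len;

	return em_end > ra_end ? em_end - ra_pos : ra_len;
}

int main(void)
{
	/* 128K readahead at 1M, compressed extent covering [1M, 1M + 512K). */
	printf("%llu\n", (unsigned long long)expanded_ra_len(1048576, 131072,
							     1048576, 524288)); /* 524288 */
	return 0;
}
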
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
@@ -941,21 +991,17 @@ static struct extent_map *__get_extent_map(struct inode *inode,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = folio_pos(folio);
- const u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
- u64 block_start;
struct extent_map *em;
int ret = 0;
- size_t pg_offset = 0;
- size_t iosize;
- size_t blocksize = fs_info->sectorsize;
+ const size_t blocksize = fs_info->sectorsize;
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
@@ -963,49 +1009,63 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return ret;
}
- if (folio->index == last_byte >> folio_shift(folio)) {
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
- if (zero_offset) {
- iosize = folio_size(folio) - zero_offset;
- folio_zero_range(folio, zero_offset, iosize);
- }
+ if (zero_offset)
+ folio_zero_range(folio, zero_offset,
+ folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio);
- while (cur <= end) {
+ for (u64 cur = start; cur <= end; cur += blocksize) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
+ unsigned long pg_offset = offset_in_folio(folio, cur);
bool force_bio_submit = false;
u64 disk_bytenr;
+ u64 block_start;
+ u64 em_gen;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = folio_size(folio) - pg_offset;
- folio_zero_range(folio, pg_offset, iosize);
- end_folio_read(folio, true, cur, iosize);
+ folio_zero_range(folio, pg_offset, end - cur + 1);
+ end_folio_read(folio, true, cur, end - cur + 1);
break;
}
- em = __get_extent_map(inode, folio, cur, end - cur + 1,
- em_cached);
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ end_folio_read(folio, true, cur, blocksize);
+ continue;
+ }
+ em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
+
+ /*
+		 * Only expand readahead for extents which are already creating
+		 * the pages anyway in add_ra_bio_pages(), i.e. compressed
+		 * extents in the non-subpage case.
+ */
+ if (bio_ctrl->ractl &&
+ !btrfs_is_subpage(fs_info, folio) &&
+ compress_type != BTRFS_COMPRESS_NONE)
+ btrfs_readahead_expand(bio_ctrl->ractl, em);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = extent_map_block_start(em) + extent_offset;
- block_start = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
+
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
+ else
+ block_start = btrfs_extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1042,30 +1102,25 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
- free_extent_map(em);
+ em_gen = em->generation;
+ btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- folio_zero_range(folio, pg_offset, iosize);
-
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ folio_zero_range(folio, pg_offset, blocksize);
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
@@ -1076,23 +1131,208 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
- pg_offset);
- cur = cur + iosize;
- pg_offset += iosize;
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
+ pg_offset, em_gen);
}
-
return 0;
}
+/*
+ * Check if we can skip waiting for the @ordered extent covering the block at @fileoff.
+ *
+ * @fileoff:	Both input and output.
+ *		Input as the file offset where the check should start.
+ *		Output as where the next check should start,
+ *		if the function returns true.
+ *
+ * Return true if we can skip to @fileoff. The caller needs to check the new
+ * @fileoff value to make sure it covers the full range, before skipping the
+ * full OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 *fileoff)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio;
+ const u32 blocksize = fs_info->sectorsize;
+ u64 cur = *fileoff;
+ bool ret;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
+
+ /*
+ * We should have locked the folio(s) for range [start, end], thus
+ * there must be a folio and it must be locked.
+ */
+ ASSERT(!IS_ERR(folio));
+ ASSERT(folio_test_locked(folio));
+
+ /*
+ * There are several cases for the folio and OE combination:
+ *
+ * 1) Folio has no private flag
+ * The OE has all its IO done but not yet finished, and folio got
+ * invalidated.
+ *
+	 * Here we have to wait for the OE to finish, as it may contain the
+ * to-be-inserted data checksum.
+ * Without the data checksum inserted into the csum tree, read will
+ * just fail with missing csum.
+ */
+ if (!folio_test_private(folio)) {
+ ret = false;
+ goto out;
+ }
+
+ /*
+ * 2) The first block is DIRTY.
+ *
+ * This means the OE is created by some other folios whose file pos is
+ * before this one. And since we are holding the folio lock, the writeback
+ * of this folio cannot start.
+ *
+ * We must skip the whole OE, because it will never start until we
+ * finished our folio read and unlocked the folio.
+ */
+ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+ u64 range_len = umin(folio_next_pos(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ ret = true;
+ /*
+ * At least inside the folio, all the remaining blocks should
+ * also be dirty.
+ */
+ ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+ *fileoff = ordered->file_offset + ordered->num_bytes;
+ goto out;
+ }
+
+ /*
+ * 3) The first block is uptodate.
+ *
+ * At least the first block can be skipped, but we are still not fully
+ * sure. E.g. if the OE has some other folios in the range that cannot
+ * be skipped.
+	 * So we return true and update @fileoff to the OE/folio boundary.
+ */
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ u64 range_len = umin(folio_next_pos(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ /*
+ * The whole range to the OE end or folio boundary should also
+ * be uptodate.
+ */
+ ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+ ret = true;
+ *fileoff = cur + range_len;
+ goto out;
+ }
+
+ /*
+ * 4) The first block is not uptodate.
+ *
+	 * This means the folio was invalidated after the writeback finished,
+	 * but some other operation (e.g. a block aligned buffered write)
+	 * inserted the folio back into the filemap.
+ * Very much the same as case 1).
+ */
+ ret = false;
+out:
+ folio_put(folio);
+ return ret;
+}
+
+static bool can_skip_ordered_extent(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 start, u64 end)
+{
+ const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+ u64 cur = max(start, ordered->file_offset);
+
+ while (cur < range_end) {
+ bool can_skip;
+
+ can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
+ if (!can_skip)
+ return false;
+ }
+ return true;
+}
+
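
A simplified standalone model of the skip decision made by can_skip_one_ordered_range(), reducing the four folio/OE cases described above to an enum (names are illustrative, not kernel code): only a dirty or uptodate first block allows skipping, and the two cases advance the file offset differently.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: simplified state of the first block at the current offset. */
enum block_state {
	BLOCK_NO_PRIVATE,	/* case 1: invalidated folio, csums may be missing */
	BLOCK_DIRTY,		/* case 2: OE created by an earlier folio we hold locked */
	BLOCK_UPTODATE,		/* case 3: data already valid in the page cache */
	BLOCK_NOT_UPTODATE,	/* case 4: invalidated then re-inserted, like case 1 */
};

/*
 * Return true when waiting for the ordered extent can be skipped for this
 * block, advancing *cur to where the next check should start.
 */
static bool can_skip_block(enum block_state state, uint64_t oe_end,
			   uint64_t folio_end, uint64_t *cur)
{
	switch (state) {
	case BLOCK_DIRTY:
		*cur = oe_end;					/* skip the whole OE */
		return true;
	case BLOCK_UPTODATE:
		*cur = folio_end < oe_end ? folio_end : oe_end;	/* OE/folio boundary */
		return true;
	default:
		return false;					/* must wait for the OE */
	}
}

int main(void)
{
	uint64_t cur = 0;

	printf("%d cur=%llu\n",
	       can_skip_block(BLOCK_UPTODATE, 1048576, 65536, &cur),
	       (unsigned long long)cur);				/* 1 cur=65536 */
	printf("%d\n", can_skip_block(BLOCK_NO_PRIVATE, 1048576, 65536, &cur)); /* 0 */
	return 0;
}
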
+/*
+ * Locking helper to make sure we get a stable view of extent maps for the
+ * involved range.
+ *
+ * This is for folio read paths (read and readahead), thus the involved range
+ * should have all the folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ u64 cur_pos;
+
+ /* Caller must provide a valid @cached_state. */
+ ASSERT(cached_state);
+
+ /* The range must at least be page aligned, as all read paths are folio based. */
+ ASSERT(IS_ALIGNED(start, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+ btrfs_lock_extent(&inode->io_tree, start, end, cached_state);
+ cur_pos = start;
+ while (cur_pos < end) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_range(inode, cur_pos,
+ end - cur_pos + 1);
+ /*
+ * No ordered extents in the range, and we hold the extent lock,
+		 * so no one can modify the extent maps in the range; we're safe to return.
+ */
+ if (!ordered)
+ break;
+
+ /* Check if we can skip waiting for the whole OE. */
+ if (can_skip_ordered_extent(inode, ordered, start, end)) {
+ cur_pos = min(ordered->file_offset + ordered->num_bytes,
+ end + 1);
+ btrfs_put_ordered_extent(ordered);
+ continue;
+ }
+
+ /* Now wait for the OE to finish. */
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
+ btrfs_put_ordered_extent(ordered);
+ /* We have unlocked the whole range, restart from the beginning. */
+ goto again;
+ }
+}
+
int btrfs_read_folio(struct file *file, struct folio *folio)
{
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ const u64 start = folio_pos(folio);
+ const u64 end = start + folio_size(folio) - 1;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
- free_extent_map(em_cached);
+ lock_extents_for_read(inode, start, end, &cached_state);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+
+ btrfs_free_extent_map(em_cached);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
@@ -1110,7 +1350,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
unsigned int start_bit;
unsigned int nbits;
- ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
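
A standalone sketch of the bitmap index arithmetic used here, assuming 4K blocks inside a larger folio (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: map a byte range inside a folio to delalloc bitmap bits. */
static void range_to_bits(uint64_t folio_start, uint64_t start, uint32_t len,
			  unsigned int sectorsize_bits,
			  unsigned int *start_bit, unsigned int *nbits)
{
	*start_bit = (unsigned int)((start - folio_start) >> sectorsize_bits);
	*nbits = len >> sectorsize_bits;
}

int main(void)
{
	unsigned int start_bit, nbits;

	/* 4K blocks (2^12 bytes) in a 64K folio at 1M: range [1M + 8K, 1M + 24K). */
	range_to_bits(1048576, 1048576 + 8192, 16384, 12, &start_bit, &nbits);
	printf("start_bit=%u nbits=%u\n", start_bit, nbits);	/* start_bit=2 nbits=4 */
	return 0;
}
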
@@ -1123,12 +1363,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
- const unsigned int bitmap_size = fs_info->sectors_per_page;
+ const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
- ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1142,14 +1382,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
}
/*
- * helper for extent_writepage(), doing all of the delayed allocation setup.
+ * Do all of the delayed allocation setup.
*
- * This returns 1 if btrfs_run_delalloc_range function did all the work required
- * to write the page (copy into inline extent). In this case the IO has
- * been started and the page is already unlocked.
+ * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
+ * The @folio should no longer be touched (treat it as already unlocked).
*
- * This returns 0 if all went well (page still locked)
- * This returns < 0 if there were errors (page still locked)
+ * Return 0 if there are still dirty blocks that need to be submitted through
+ * extent_writepage_io().
+ * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
+ * submitted, and @folio is still kept locked.
+ *
+ * Return <0 if any error is hit.
+ * Any allocated ordered extent range covering this folio will be marked
+ * finished (IOERR), and @folio is still kept locked.
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct folio *folio,
@@ -1157,9 +1402,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
struct writeback_control *wbc = bio_ctrl->wbc;
- const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
@@ -1167,6 +1413,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* last delalloc end.
*/
u64 last_delalloc_end = 0;
+ /*
+ * The range end (exclusive) of the last successfully finished delalloc
+ * range.
+	 * Any range covered by an ordered extent must either be manually marked
+	 * finished (error handling), or have IO submitted (which finishes the
+	 * ordered extent normally).
+ *
+ * This records the end of ordered extent cleanup if we hit an error.
+ */
+ u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
@@ -1174,14 +1430,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
- ASSERT(fs_info->sectors_per_page > 1);
+ if (btrfs_is_subpage(fs_info, folio)) {
+ ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
@@ -1235,19 +1491,36 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) {
+ /*
+			 * Some delalloc ranges may have been created by previous folios.
+ * Thus we still need to clean up this range during error
+ * handling.
+ */
+ last_finished_delalloc_end = found_start;
/* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc);
+ if (ret >= 0)
+ last_finished_delalloc_end = found_start + found_len;
+ if (unlikely(ret < 0))
+ btrfs_err_rl(fs_info,
+"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
+ btrfs_root_id(inode->root),
+ btrfs_ino(inode),
+ folio_pos(folio),
+ blocks_per_folio,
+ &bio_ctrl->submit_bitmap,
+ found_start, found_len, ret);
} else {
/*
* We've hit an error during previous delalloc range,
* have to cleanup the remaining locked ranges.
*/
- unlock_extent(&inode->io_tree, found_start,
- found_start + found_len - 1, NULL);
- __unlock_for_delalloc(&inode->vfs_inode, folio,
+ btrfs_unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
+ unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
@@ -1274,8 +1547,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = found_start + found_len;
}
- if (ret < 0)
+ /*
+ * It's possible we had some ordered extents created before we hit
+ * an error, cleanup non-async successfully created delalloc ranges.
+ */
+ if (unlikely(ret < 0)) {
+ unsigned int bitmap_size = min(
+ (last_finished_delalloc_end - page_start) >>
+ fs_info->sectorsize_bits,
+ blocks_per_folio);
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+ btrfs_mark_ordered_io_finished(inode, folio,
+ page_start + (bit << fs_info->sectorsize_bits),
+ fs_info->sectorsize, false);
return ret;
+ }
out:
if (last_delalloc_end)
delalloc_end = last_delalloc_end;
@@ -1283,7 +1570,7 @@ out:
delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
+ * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
@@ -1292,7 +1579,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1311,7 +1598,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1334,23 +1621,32 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
- return PTR_ERR_OR_ZERO(em);
+ if (IS_ERR(em)) {
+ /*
+ * When submission failed, we should still clear the folio dirty.
+ * Or the folio will be written back again but without any
+ * ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
+ return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
- em_end = extent_map_end(em);
+ em_end = btrfs_extent_map_end(em);
ASSERT(filepos <= em_end);
ASSERT(IS_ALIGNED(em->start, sectorsize));
ASSERT(IS_ALIGNED(em->len, sectorsize));
- block_start = extent_map_block_start(em);
- disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = btrfs_extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
/*
@@ -1370,7 +1666,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(folio_test_writeback(folio));
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio));
+ sectorsize, filepos - folio_pos(folio), 0);
return 0;
}
@@ -1391,35 +1687,62 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
+ int found_error = 0;
+ const u64 end = start + len;
const u64 folio_start = folio_pos(folio);
+ const u64 folio_end = folio_start + folio_size(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
- ASSERT(start >= folio_start &&
- start + len <= folio_start + folio_size(folio));
+ ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start);
+ ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
+ start, len, folio_start, folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
- if (ret) {
+ if (ret == -EAGAIN) {
/* Fixup worker will requeue */
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
folio_unlock(folio);
return 1;
}
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
+ return ret;
+ }
- for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+ for (cur = start; cur < end; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- fs_info->sectors_per_page);
+ blocks_per_folio);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ folio_end - cur);
+ /*
+ * We have just run delalloc before getting here, so
+ * there must be an ordered extent.
+ */
+ ASSERT(ordered != NULL);
+ spin_lock(&inode->ordered_tree_lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock(&inode->ordered_tree_lock);
+ btrfs_put_ordered_extent(ordered);
+
btrfs_mark_ordered_io_finished(inode, folio, cur,
- start + len - cur, true);
+ end - cur, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
@@ -1428,16 +1751,31 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_folio_clear_dirty(fs_info, folio, cur,
- start + len - cur);
+ btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur);
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
- if (ret < 0)
- goto out;
+ if (unlikely(ret < 0)) {
+ /*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+			 * to finish normally, rather than being marked as an error.
+ */
+ submit_one_bio(bio_ctrl);
+ /*
+ * Failed to grab the extent map which should be very rare.
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ fs_info->sectorsize, false);
+ if (!found_error)
+ found_error = ret;
+ continue;
+ }
submitted_io = true;
}
-out:
+
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
@@ -1445,12 +1783,15 @@ out:
*
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
+ *
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
- if (!submitted_io) {
+ if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
- return ret;
+ return found_error;
}
/*
@@ -1464,15 +1805,15 @@ out:
*/
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
- struct inode *inode = folio->mapping->host;
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- const u64 page_start = folio_pos(folio);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
size_t pg_offset;
- loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_SHIFT;
+ loff_t i_size = i_size_read(&inode->vfs_inode);
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
- trace_extent_writepage(folio, inode, bio_ctrl->wbc);
+ trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
WARN_ON(!folio_test_locked(folio));
@@ -1484,7 +1825,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
return 0;
}
- if (folio->index == end_index)
+ if (folio_contains(folio, end_index))
folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
/*
@@ -1492,30 +1833,56 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* The proper bitmap can only be initialized until writepage_delalloc().
*/
bio_ctrl->submit_bitmap = (unsigned long)-1;
+
+ /*
+ * If the page is dirty but without private set, it's marked dirty
+ * without informing the fs.
+ * Nowadays that is a bug, since the introduction of
+ * pin_user_pages*().
+ *
+	 * So here we check if the page has private set to rule out such a
+ * case.
+ * But we also have a long history of relying on the COW fixup,
+ * so here we only enable this check for experimental builds until
+ * we're sure it's safe.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
+ unlikely(!folio_test_private(folio))) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ btrfs_root_id(inode->root),
+ btrfs_ino(inode), folio_pos(folio));
+ ret = -EUCLEAN;
+ goto done;
+ }
+
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
- ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl);
+ ret = writepage_delalloc(inode, folio, bio_ctrl);
if (ret == 1)
return 0;
if (ret)
goto done;
- ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio),
- PAGE_SIZE, bio_ctrl, i_size);
+ ret = extent_writepage_io(inode, folio, folio_pos(folio),
+ folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
+ if (unlikely(ret < 0))
+ btrfs_err_rl(fs_info,
+"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(folio), blocks_per_folio,
+ &bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
done:
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- page_start, PAGE_SIZE, !ret);
+ if (ret < 0)
mapping_set_error(folio->mapping, ret);
- }
-
/*
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
@@ -1525,12 +1892,6 @@ done:
return ret;
}
-void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
-{
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
- TASK_UNINTERRUPTIBLE);
-}
-
/*
* Lock extent buffer status and pages for writeback.
*
@@ -1560,8 +1921,19 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
*/
spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-eb->len,
@@ -1647,50 +2019,167 @@ static void set_btree_ioerr(struct extent_buffer *eb)
}
}
+static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
+
+static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
+
+static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
+ unsigned long start, unsigned long end)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, start);
+ unsigned int tagged = 0;
+ void *eb;
+
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+}
+
+struct eb_batch {
+ unsigned int nr;
+ unsigned int cur;
+ struct extent_buffer *ebs[PAGEVEC_SIZE];
+};
+
+static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
+{
+ batch->ebs[batch->nr++] = eb;
+ return (batch->nr < PAGEVEC_SIZE);
+}
+
+static inline void eb_batch_init(struct eb_batch *batch)
+{
+ batch->nr = 0;
+ batch->cur = 0;
+}
+
+static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
+{
+ if (batch->cur >= batch->nr)
+ return NULL;
+ return batch->ebs[batch->cur++];
+}
+
+static inline void eb_batch_release(struct eb_batch *batch)
+{
+ for (unsigned int i = 0; i < batch->nr; i++)
+ free_extent_buffer(batch->ebs[i]);
+ eb_batch_init(batch);
+}
+
+static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
+ xa_mark_t mark)
+{
+ struct extent_buffer *eb;
+
+retry:
+ eb = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, eb))
+ goto retry;
+
+ if (!eb)
+ return NULL;
+
+ if (!refcount_inc_not_zero(&eb->refs)) {
+ xas_reset(xas);
+ goto retry;
+ }
+
+ if (unlikely(eb != xas_reload(xas))) {
+ free_extent_buffer(eb);
+ xas_reset(xas);
+ goto retry;
+ }
+
+ return eb;
+}
+
+static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
+ unsigned long *start,
+ unsigned long end, xa_mark_t tag,
+ struct eb_batch *batch)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, *start);
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
+ if (!eb_batch_add(batch, eb)) {
+ *start = ((eb->start + eb->len) >> fs_info->nodesize_bits);
+ goto out;
+ }
+ }
+ if (end == ULONG_MAX)
+ *start = ULONG_MAX;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return batch->nr;
+}
+
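
A standalone sketch of the buffer_tree indexing these helpers rely on, where each tree block occupies one xarray slot at index (start >> nodesize_bits) (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only, not kernel code: each tree block occupies one xarray
 * slot, so a logical byte offset maps to its buffer_tree index by shifting
 * out the nodesize bits.
 */
static unsigned long eb_index(uint64_t start, unsigned int nodesize_bits)
{
	return (unsigned long)(start >> nodesize_bits);
}

int main(void)
{
	const unsigned int nodesize_bits = 14;	/* 16K tree blocks */
	const uint64_t eb_start = 30408704;	/* 16K aligned logical address */

	/* Adjacent tree blocks land on consecutive xarray indices. */
	printf("%lu %lu\n", eb_index(eb_start, nodesize_bits),
	       eb_index(eb_start + 16384, nodesize_bits));	/* 1856 1857 */
	return 0;
}
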
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- const struct btrfs_fs_info *fs_info, u64 start)
+ struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
+ unsigned long index = (start >> fs_info->nodesize_bits);
rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- return eb;
- }
+ eb = xa_load(&fs_info->buffer_tree, index);
+ if (eb && !refcount_inc_not_zero(&eb->refs))
+ eb = NULL;
rcu_read_unlock();
- return NULL;
+ return eb;
}
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
- bool uptodate = !bbio->bio.bi_status;
struct folio_iter fi;
- u32 bio_offset = 0;
- if (!uptodate)
+ if (bbio->bio.bi_status != BLK_STS_OK)
set_btree_ioerr(eb);
bio_for_each_folio_all(fi, &bbio->bio) {
- u64 start = eb->start + bio_offset;
- struct folio *folio = fi.folio;
- u32 len = fi.length;
-
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- bio_offset += len;
+ btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
+ clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
bio_put(&bbio->bio);
}
@@ -1732,205 +2221,69 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, end_bbio_meta_write, eb);
+ BTRFS_I(fs_info->btree_inode), eb->start,
+ end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
- bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
- bbio->file_offset = eb->start;
- if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
- bool ret;
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_next_pos(folio),
+ eb->start + eb->len) - range_start;
folio_lock(folio);
- btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
- eb->len)) {
- folio_clear_dirty_for_io(folio);
- wbc->nr_to_write--;
- }
- ret = bio_add_folio(&bbio->bio, folio, eb->len,
- eb->start - folio_pos(folio));
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->len);
- folio_unlock(folio);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
- bool ret;
-
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ btrfs_meta_folio_clear_dirty(folio, eb);
+ btrfs_meta_folio_set_writeback(folio, eb);
+ if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
+ wbc_account_cgroup_owner(wbc, folio, range_len);
+ folio_unlock(folio);
+ }
+ /*
+ * If the fs is already in error status, do not submit any writeback
+ * but immediately finish it.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info))) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info)));
+ return;
}
btrfs_submit_bbio(bbio, 0);
}
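With the single folio loop above, a folio may cover only part of the eb (subpage) or extend past it (large folios), so each bio segment is the intersection of the eb's byte range with the folio. A small sketch of that intersection, equivalent to the range_start/range_len computation above but written with folio_pos()/folio_size() instead of folio_next_pos():

/*
 * Intersection of [eb->start, eb->start + eb->len) with one folio.
 * Example: 4K pages, 16K nodesize, eb at 120K: for the folio at 124K this
 * gives range_start = 124K and range_len = 4K.
 */
static void eb_folio_range(const struct extent_buffer *eb, const struct folio *folio,
			   u64 *range_start, u32 *range_len)
{
	const u64 folio_end = folio_pos(folio) + folio_size(folio);

	*range_start = max_t(u64, eb->start, folio_pos(folio));
	*range_len = min_t(u64, folio_end, eb->start + eb->len) - *range_start;
}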
/*
- * Submit one subpage btree page.
- *
- * The main difference to submit_eb_page() is:
- * - Page locking
- * For subpage, we don't rely on page locking at all.
- *
- * - Flush write bio
- * We only flush bio if we may be unable to fit current extent buffers into
- * current bio.
+ * Wait for all eb writeback in the given range to finish.
*
- * Return >=0 for the number of submitted extent buffers.
- * Return <0 for fatal error.
+ * @fs_info: The fs_info for this file system.
+ * @start: The offset of the range to start waiting on writeback.
+ * @end: The end of the range, inclusive. This is meant to be used in
+ * conjunction with wait_marked_extents, so this will usually be
+ * the_next_eb->start - 1.
*/
-static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- int submitted = 0;
- u64 folio_start = folio_pos(folio);
- int bit_start = 0;
- int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
-
- /* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->sectors_per_page) {
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct eb_batch batch;
+ unsigned long start_index = (start >> fs_info->nodesize_bits);
+ unsigned long end_index = (end >> fs_info->nodesize_bits);
+
+ eb_batch_init(&batch);
+ while (start_index <= end_index) {
struct extent_buffer *eb;
- unsigned long flags;
- u64 start;
+ unsigned int nr_ebs;
- /*
- * Take private lock to ensure the subpage won't be detached
- * in the meantime.
- */
- spin_lock(&folio->mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&folio->mapping->i_private_lock);
+ nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
+ PAGECACHE_TAG_WRITEBACK, &batch);
+ if (!nr_ebs)
break;
- }
- spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
- subpage->bitmaps)) {
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
- continue;
- }
-
- start = folio_start + bit_start * fs_info->sectorsize;
- bit_start += sectors_per_node;
-
- /*
- * Here we just want to grab the eb without touching extra
- * spin locks, so call find_extent_buffer_nolock().
- */
- eb = find_extent_buffer_nolock(fs_info, start);
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
-
- /*
- * The eb has already reached 0 refs thus find_extent_buffer()
- * doesn't return it. We don't need to write back such eb
- * anyway.
- */
- if (!eb)
- continue;
-
- if (lock_extent_buffer_for_io(eb, wbc)) {
- write_one_eb(eb, wbc);
- submitted++;
- }
- free_extent_buffer(eb);
- }
- return submitted;
-}
-
-/*
- * Submit all page(s) of one extent buffer.
- *
- * @page: the page of one extent buffer
- * @eb_context: to determine if we need to submit this page, if current page
- * belongs to this eb, we don't need to submit
- *
- * The caller should pass each page in their bytenr order, and here we use
- * @eb_context to determine if we have submitted pages of one extent buffer.
- *
- * If we have, we just skip until we hit a new page that doesn't belong to
- * current @eb_context.
- *
- * If not, we submit all the page(s) of the extent buffer.
- *
- * Return >0 if we have submitted the extent buffer successfully.
- * Return 0 if we don't need to submit the page, as it's already submitted by
- * previous call.
- * Return <0 for fatal error.
- */
-static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
-{
- struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = folio->mapping;
- struct extent_buffer *eb;
- int ret;
-
- if (!folio_test_private(folio))
- return 0;
-
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(folio, wbc);
-
- spin_lock(&mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- eb = folio_get_private(folio);
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON but no point
- * crashing the machine for something we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- if (eb == ctx->eb) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->i_private_lock);
- if (!ret)
- return 0;
-
- ctx->eb = eb;
- ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- free_extent_buffer(eb);
- return ret;
- }
-
- if (!lock_extent_buffer_for_io(eb, wbc)) {
- free_extent_buffer(eb);
- return 0;
- }
- /* Implies write in zoned mode. */
- if (ctx->zoned_bg) {
- /* Mark the last eb in the block group. */
- btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
- ctx->zoned_bg->meta_write_pointer += eb->len;
+ while ((eb = eb_batch_next(&batch)) != NULL)
+ wait_on_extent_buffer_writeback(eb);
+ eb_batch_release(&batch);
+ cond_resched();
}
- write_one_eb(eb, wbc);
- free_extent_buffer(eb);
- return 1;
}
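This is the metadata counterpart of filemap_fdatawait_range(), keyed on the per-eb PAGECACHE_TAG_WRITEBACK mark in the buffer_tree rather than on per-folio tags. A hedged usage sketch (an illustrative helper, not part of this patch) for flushing and then waiting on a logical range of tree blocks; byte offsets into the btree inode equal logical bytenrs:

static int write_and_wait_meta_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
	int ret;

	ret = filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, start, end);
	btrfs_btree_wait_writeback_range(fs_info, start, end);
	return ret;
}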
int btree_write_cache_pages(struct address_space *mapping,
@@ -1941,25 +2294,27 @@ int btree_write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
- struct folio_batch fbatch;
- unsigned int nr_folios;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
+ struct eb_batch batch;
+ unsigned int nr_ebs;
+ unsigned long index;
+ unsigned long end;
int scanned = 0;
xa_mark_t tag;
- folio_batch_init(&fbatch);
+ eb_batch_init(&batch);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
+ index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
end = -1;
+
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ index = (wbc->range_start >> fs_info->nodesize_bits);
+ end = (wbc->range_end >> fs_info->nodesize_bits);
+
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1969,31 +2324,39 @@ int btree_write_cache_pages(struct address_space *mapping,
btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
+ buffer_tree_tag_for_writeback(fs_info, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_folios = filemap_get_folios_tag(mapping, &index, end,
- tag, &fbatch))) {
- unsigned i;
+ (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
+ struct extent_buffer *eb;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = fbatch.folios[i];
+ while ((eb = eb_batch_next(&batch)) != NULL) {
+ ctx.eb = eb;
+
+ ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
- ret = submit_eb_page(folio, &ctx);
- if (ret == 0)
+ if (ret) {
+ done = 1;
+ break;
+ }
continue;
- if (ret < 0) {
- done = 1;
- break;
}
- /*
- * the filesystem may choose to bump up nr_to_write.
- * We have to make sure to honor the new nr_to_write
- * at any time
- */
- nr_to_write_done = wbc->nr_to_write <= 0;
+ if (!lock_extent_buffer_for_io(eb, wbc))
+ continue;
+
+ /* Implies write in zoned mode. */
+ if (ctx.zoned_bg) {
+ /* Mark the last eb in the block group. */
+ btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
+ ctx.zoned_bg->meta_write_pointer += eb->len;
+ }
+ write_one_eb(eb, wbc);
}
- folio_batch_release(&fbatch);
+ nr_to_write_done = (wbc->nr_to_write <= 0);
+ eb_batch_release(&batch);
cond_resched();
}
if (!scanned && !done) {
@@ -2119,10 +2482,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
&BTRFS_I(inode)->runtime_flags))
wbc->tagged_writepages = 1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
@@ -2138,10 +2498,8 @@ retry:
done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
- * the page lock: the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to
- * tmpfs file mapping
+ * the folio lock: the folio may be truncated or
+ * invalidated (changing folio->mapping to NULL).
*/
if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
@@ -2168,7 +2526,7 @@ retry:
* In above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
- * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+ * If we didn't wait writeback for page 64K, [128K, 128K)
* won't be submitted as the page still has writeback flag
* and will be skipped in the next check.
*
@@ -2179,7 +2537,7 @@ retry:
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
- btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
+ btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2260,8 +2618,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
while (cur <= end) {
- u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
- u32 cur_len = cur_end + 1 - cur;
+ u64 cur_end;
+ u32 cur_len;
struct folio *folio;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2271,13 +2629,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* code is just in case, but shouldn't actually be run.
*/
if (IS_ERR(folio)) {
+ cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
- cur = cur_end + 1;
+ cur = cur_end + 1;
continue;
}
+ cur_end = min_t(u64, folio_next_pos(folio) - 1, end);
+ cur_len = cur_end + 1 - cur;
+
ASSERT(folio_test_locked(folio));
if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
@@ -2292,11 +2655,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
if (ret == 1)
goto next_page;
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- cur, cur_len, !ret);
+ if (ret)
mapping_set_error(mapping, ret);
- }
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
@@ -2330,16 +2690,27 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb
void btrfs_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ | REQ_RAHEAD,
+ .ractl = rac,
+ .last_em_start = U64_MAX,
+ };
struct folio *folio;
+ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ const u64 start = readahead_pos(rac);
+ const u64 end = start + readahead_length(rac) - 1;
+ struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
+
+ lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
if (em_cached)
- free_extent_map(em_cached);
+ btrfs_free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
}
@@ -2363,7 +2734,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent(tree, start, end, &cached_state);
+ btrfs_lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -2371,46 +2742,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent(tree, start, end, &cached_state);
+ btrfs_unlock_extent(tree, start, end, &cached_state);
return 0;
}
/*
- * a helper for release_folio, this tests for areas of the page that
- * are locked or under IO and drops the related state bits if it is safe
- * to drop the page.
+ * A helper for struct address_space_operations::release_folio, this tests for
+ * areas of the folio that are locked or under IO and drops the related state
+ * bits if it is safe to drop the folio.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
+ struct extent_state *cached_state = NULL;
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
- bool ret;
+ u64 end = start + folio_size(folio) - 1;
+ u32 range_bits;
+ u32 clear_bits;
+ bool ret = false;
+ int ret2;
- if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = false;
- } else {
- u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
- EXTENT_QGROUP_RESERVED);
- int ret2;
+ btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state);
- /*
- * At this point we can safely clear everything except the
- * locked bit, the nodatasum bit and the delalloc new bit.
- * The delalloc new bit will be cleared by ordered extent
- * completion.
- */
- ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ /*
+ * We can release the folio if it's locked only for ordered extent
+ * completion, since that doesn't require using the folio.
+ */
+ if ((range_bits & EXTENT_LOCKED) &&
+ !(range_bits & EXTENT_FINISHING_ORDERED))
+ goto out;
+
+ clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW |
+ EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED |
+ EXTENT_FINISHING_ORDERED);
+ /*
+ * At this point we can safely clear everything except the locked,
+ * nodatasum, delalloc new and finishing ordered bits. The delalloc new
+ * bit will be cleared by ordered extent completion.
+ */
+ ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state);
+ /*
+ * If clear_extent_bit failed for enomem reasons, we can't allow the
+ * release to continue.
+ */
+ if (ret2 == 0)
+ ret = true;
+out:
+ btrfs_free_extent_state(cached_state);
- /* if clear_extent_bit failed for enomem reasons,
- * we can't allow the release to continue.
- */
- if (ret2 < 0)
- ret = false;
- else
- ret = true;
- }
return ret;
}
@@ -2422,7 +2801,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -2433,18 +2812,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
struct extent_map *em;
write_lock(&extent_tree->lock);
- em = lookup_extent_mapping(extent_tree, start, len);
+ em = btrfs_lookup_extent_mapping(extent_tree, start, len);
if (!em) {
write_unlock(&extent_tree->lock);
break;
}
if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
write_unlock(&extent_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
- if (test_range_bit_exists(io_tree, em->start,
- extent_map_end(em) - 1, EXTENT_LOCKED))
+ if (btrfs_test_range_bit_exists(io_tree, em->start,
+ btrfs_extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used by a fast
@@ -2471,15 +2851,15 @@ remove_em:
* fsync performance for workloads with a data size that exceeds
* or is close to the system's memory).
*/
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
/* Once for the inode's extent map tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
- start = extent_map_end(em);
+ start = btrfs_extent_map_end(em);
write_unlock(&extent_tree->lock);
/* Once for us, for the lookup_extent_mapping() reference. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (need_resched()) {
/*
@@ -2495,11 +2875,6 @@ next:
return try_release_extent_state(io_tree, folio);
}
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
- kmem_cache_free(extent_buffer_cache, eb);
-}
-
static int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -2508,13 +2883,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
static bool folio_range_has_eb(struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
lockdep_assert_held(&folio->mapping->i_private_lock);
if (folio_test_private(folio)) {
- subpage = folio_get_private(folio);
- if (atomic_read(&subpage->eb_refs))
+ bfs = folio_get_private(folio);
+ if (atomic_read(&bfs->eb_refs))
return true;
}
return false;
@@ -2523,6 +2898,7 @@ static bool folio_range_has_eb(struct folio *folio)
static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = folio->mapping;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
@@ -2530,21 +2906,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&folio->mapping->i_private_lock);
+ spin_lock(&mapping->i_private_lock);
if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
/*
- * We do this since we'll remove the pages after we've
- * removed the eb from the radix tree, so we could race
- * and have this page now attached to the new eb. So
- * only clear folio if it's still connected to
- * this eb.
+ * We do this since we'll remove the pages after we've removed
+ * the eb from the xarray, so we could race and have this page
+ * now attached to the new eb. So only clear folio if it's
+ * still connected to this eb.
*/
if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2554,7 +2929,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
@@ -2564,7 +2939,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2575,13 +2950,13 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
}
-/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
+/* Release all folios attached to the extent buffer */
+static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
@@ -2592,9 +2967,6 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
continue;
detach_extent_buffer_folio(eb, folio);
-
- /* One for when we allocated the folio. */
- folio_put(folio);
}
}
@@ -2603,40 +2975,57 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_pages(eb);
+ btrfs_release_extent_buffer_folios(eb);
btrfs_leak_debug_del_eb(eb);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
-static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len)
+static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
- eb->len = len;
+ eb->len = fs_info->nodesize;
eb->fs_info = fs_info;
init_rwsem(&eb->lock);
btrfs_leak_debug_add_eb(eb);
spin_lock_init(&eb->refs_lock);
- atomic_set(&eb->refs, 1);
+ refcount_set(&eb->refs, 1);
- ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+ ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
+/*
+ * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer()
+ * does not call folio_put(), and we need to set the folios to NULL so that
+ * btrfs_release_extent_buffer() will not detach them a second time.
+ */
+static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
+{
+ const int num_folios = num_extent_folios(eb);
+
+ /* We cannot use num_extent_folios() as the loop bound, as the loop NULLs out eb->folios[]. */
+ for (int i = 0; i < num_folios; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
+ }
+}
+
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
- int num_folios = num_extent_folios(src);
+ int num_folios;
int ret;
- new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ new = __alloc_extent_buffer(src->fs_info, src->start);
if (new == NULL)
return NULL;
@@ -2648,78 +3037,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
ret = alloc_eb_folio_array(new, false);
- if (ret) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret)
+ goto release_eb;
+ ASSERT(num_extent_folios(src) == num_extent_folios(new),
+ "%d != %d", num_extent_folios(src), num_extent_folios(new));
+ /* Explicitly use the cached num_extent_folios() value from now on. */
+ num_folios = num_extent_folios(src);
for (int i = 0; i < num_folios; i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
- if (ret < 0) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret < 0)
+ goto cleanup_folios;
WARN_ON(folio_test_dirty(folio));
}
+ for (int i = 0; i < num_folios; i++)
+ folio_put(new->folios[i]);
+
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
return new;
+
+cleanup_folios:
+ cleanup_extent_buffer_folios(new);
+release_eb:
+ btrfs_release_extent_buffer(new);
+ return NULL;
}
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- int num_folios = 0;
int ret;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return NULL;
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto err;
+ goto release_eb;
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto err;
+ goto cleanup_folios;
}
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ folio_put(eb->folios[i]);
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
-err:
- for (int i = 0; i < num_folios; i++) {
- if (eb->folios[i]) {
- detach_extent_buffer_folio(eb, eb->folios[i]);
- folio_put(eb->folios[i]);
- }
- }
- __free_extent_buffer(eb);
- return NULL;
-}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
-{
- return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
+cleanup_folios:
+ cleanup_extent_buffer_folios(eb);
+release_eb:
+ btrfs_release_extent_buffer(eb);
+ return NULL;
}
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
/*
- * The TREE_REF bit is first set when the extent_buffer is added
- * to the radix tree. It is also reset, if unset, when a new reference
- * is created by find_extent_buffer.
+ * The TREE_REF bit is first set when the extent_buffer is added to the
+ * xarray. It is also reset, if unset, when a new reference is created
+ * by find_extent_buffer.
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
@@ -2731,31 +3120,28 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * The actual lifetime of the extent_buffer in the radix tree is
- * adequately protected by the refcount, but the TREE_REF bit and
- * its corresponding reference are not. To protect against this
- * class of races, we call check_buffer_tree_ref from the codepaths
- * which trigger io. Note that once io is initiated, TREE_REF can no
- * longer be cleared, so that is the moment at which any such race is
- * best fixed.
+ * The actual lifetime of the extent_buffer in the xarray is adequately
+ * protected by the refcount, but the TREE_REF bit and its corresponding
+ * reference are not. To protect against this class of races, we call
+ * check_buffer_tree_ref() from the code paths which trigger io. Note that
+ * once io is initiated, TREE_REF can no longer be cleared, so that is
+ * the moment at which any such race is best fixed.
*/
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
return;
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
spin_unlock(&eb->refs_lock);
}
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_folios= num_extent_folios(eb);
-
check_buffer_tree_ref(eb);
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_mark_accessed(eb->folios[i]);
}
@@ -2788,10 +3174,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2803,47 +3189,48 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
+ xa_lock_irq(&fs_info->buffer_tree);
+ exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
+ NULL, eb, GFP_NOFS);
+ if (xa_is_err(exists)) {
+ ret = xa_err(exists);
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return ERR_PTR(ret);
}
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- else
+ if (exists) {
+ if (!refcount_inc_not_zero(&exists->refs)) {
+ /* The extent buffer is being freed, retry. */
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return exists;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
return eb;
-free_eb:
- btrfs_release_extent_buffer(eb);
- return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
-static struct extent_buffer *grab_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page)
+static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct extent_buffer *exists;
- lockdep_assert_held(&page->mapping->i_private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
/*
- * For subpage case, we completely rely on radix tree to ensure we
- * don't try to insert two ebs for the same bytenr. So here we always
- * return NULL and just continue.
+ * For subpage case, we completely rely on xarray to ensure we don't try
+ * to insert two ebs for the same bytenr. So here we always return NULL
+ * and just continue.
*/
- if (fs_info->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(fs_info))
return NULL;
/* Page not yet attached to an extent buffer */
@@ -2851,51 +3238,53 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/*
- * We could have already allocated an eb for this page and attached one
+ * We could have already allocated an eb for this folio and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
* just overwrite folio private.
*/
exists = folio_get_private(folio);
- if (atomic_inc_not_zero(&exists->refs))
+ if (refcount_inc_not_zero(&exists->refs))
return exists;
- WARN_ON(PageDirty(page));
+ WARN_ON(folio_test_dirty(folio));
folio_detach_private(folio);
return NULL;
}
-static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+/*
+ * Validate alignment constraints of eb at logical address @start.
+ */
+static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ const u32 nodesize = fs_info->nodesize;
+
+ if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
btrfs_err(fs_info, "bad tree block start %llu", start);
- return -EINVAL;
+ return true;
}
- if (fs_info->nodesize < PAGE_SIZE &&
- offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %u",
- start, fs_info->nodesize);
- return -EINVAL;
+ "tree block is not nodesize aligned, start %llu nodesize %u",
+ start, nodesize);
+ return true;
}
- if (fs_info->nodesize >= PAGE_SIZE &&
- !PAGE_ALIGNED(start)) {
+ if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
- return -EINVAL;
+ start, nodesize);
+ return true;
}
- if (!IS_ALIGNED(start, fs_info->nodesize) &&
- !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ if (unlikely(!IS_ALIGNED(start, nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
btrfs_warn(fs_info,
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
- start, fs_info->nodesize);
+ start, nodesize);
}
- return 0;
+ return false;
}
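Note the inverted convention: check_eb_alignment() now returns true when the alignment is bad. The call site in alloc_extent_buffer() is presumably reduced to a one-line guard along these lines (a sketch of the assumed caller, not shown in this hunk):

	if (check_eb_alignment(fs_info, start))
		return ERR_PTR(-EINVAL);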
-
/*
* Return 0 if eb->folios[i] is attached to btree inode successfully.
* Return >0 if there is already another extent buffer for the range,
@@ -2905,14 +3294,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
- struct btrfs_subpage *prealloc,
+ struct btrfs_folio_state *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
- const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio = NULL;
+ const pgoff_t index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
int ret;
ASSERT(found_eb_ret);
@@ -2921,6 +3310,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
ASSERT(eb->folios[i]);
retry:
+ existing_folio = NULL;
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
@@ -2928,10 +3318,8 @@ retry:
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio)) {
- existing_folio = NULL;
+ if (IS_ERR(existing_folio))
goto retry;
- }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -2944,15 +3332,14 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
- if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
} else if (existing_folio) {
struct extent_buffer *existing_eb;
- existing_eb = grab_extent_buffer(fs_info,
- folio_page(existing_folio, 0));
+ existing_eb = grab_extent_buffer(fs_info, existing_folio);
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
@@ -2973,7 +3360,7 @@ finish:
/*
* To inform we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
- * eb hasn't been inserted into radix tree yet.
+ * eb hasn't been inserted into the xarray yet.
*
* The ref will be decreased when the eb releases the page, in
* detach_extent_buffer_page(). Thus needs no special handling in the
@@ -2987,12 +3374,10 @@ finish:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
- unsigned long len = fs_info->nodesize;
- int num_folios;
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct btrfs_subpage *prealloc = NULL;
+ struct btrfs_folio_state *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
int uptodate = 1;
@@ -3016,7 +3401,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
@@ -3036,8 +3421,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (btrfs_meta_is_subpage(fs_info)) {
+ prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3048,13 +3433,12 @@ reallocate:
/* Allocate all pages first. */
ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
goto out;
}
- num_folios = num_extent_folios(eb);
/* Attach all pages to the filemap. */
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
@@ -3083,7 +3467,7 @@ reallocate:
* using 0-order folios.
*/
if (unlikely(ret == -EAGAIN)) {
- ASSERT(0);
+ DEBUG_WARN("folio order mismatch between new eb and filemap");
goto reallocate;
}
attached++;
@@ -3094,7 +3478,7 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+ WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/*
* Check if the current page is physically contiguous with previous eb
@@ -3105,15 +3489,14 @@ reallocate:
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
- if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
+ if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/*
* We can't unlock the pages just yet since the extent buffer
- * hasn't been properly inserted in the radix tree, this
- * opens a race with btree_release_folio which can free a page
- * while we are still filling in all pages for the buffer and
- * we could crash.
+ * hasn't been properly inserted into the xarray, this opens a
+ * race with btree_release_folio() which can free a page while we
+ * are still filling in all pages for the buffer and we could crash.
*/
}
if (uptodate)
@@ -3122,38 +3505,46 @@ reallocate:
if (page_contig)
eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ xa_lock_irq(&fs_info->buffer_tree);
+ existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+ start >> fs_info->nodesize_bits, NULL, eb,
+ GFP_NOFS);
+ if (xa_is_err(existing_eb)) {
+ ret = xa_err(existing_eb);
+ xa_unlock_irq(&fs_info->buffer_tree);
goto out;
-
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- ret = 0;
- existing_eb = find_extent_buffer(fs_info, start);
- if (existing_eb)
- goto out;
- else
+ }
+ if (existing_eb) {
+ if (!refcount_inc_not_zero(&existing_eb->refs)) {
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ goto out;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
+
/* add one reference for the tree */
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* Now it's safe to unlock the pages because any calls to
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (int i = 0; i < num_folios; i++)
- unlock_page(folio_page(eb->folios[i], 0));
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ folio_unlock(eb->folios[i]);
+ /*
+ * A folio that has been added to an address_space mapping
+ * should not continue holding the refcount from its original
+ * allocation indefinitely.
+ */
+ folio_put(eb->folios[i]);
+ }
return eb;
out:
- WARN_ON(!atomic_dec_and_test(&eb->refs));
+ WARN_ON(!refcount_dec_and_test(&eb->refs));
/*
* Any attached folios need to be detached before we unlock them. This
@@ -3163,26 +3554,22 @@ out:
* want that to grab this eb, as we're getting ready to free it. So we
* have to detach it first and then unlock it.
*
- * We have to drop our reference and NULL it out here because in the
- * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
- * Below when we call btrfs_release_extent_buffer() we will call
- * detach_extent_buffer_folio() on our remaining pages in the !subpage
- * case. If we left eb->folios[i] populated in the subpage case we'd
- * double put our reference and be super sad.
+ * Note: the bound is num_extent_pages() as we need to go through all slots.
*/
- for (int i = 0; i < attached; i++) {
- ASSERT(eb->folios[i]);
- detach_extent_buffer_folio(eb, eb->folios[i]);
- unlock_page(folio_page(eb->folios[i], 0));
- folio_put(eb->folios[i]);
+ for (int i = 0; i < num_extent_pages(eb); i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (i < attached) {
+ ASSERT(folio);
+ detach_extent_buffer_folio(eb, folio);
+ folio_unlock(folio);
+ } else if (!folio) {
+ continue;
+ }
+
+ folio_put(folio);
eb->folios[i] = NULL;
}
- /*
- * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing page->mapping.
- */
- set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
-
btrfs_release_extent_buffer(eb);
if (ret < 0)
return ERR_PTR(ret);
@@ -3195,7 +3582,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
struct extent_buffer *eb =
container_of(head, struct extent_buffer, rcu_head);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
static int release_extent_buffer(struct extent_buffer *eb)
@@ -3203,27 +3590,35 @@ static int release_extent_buffer(struct extent_buffer *eb)
{
lockdep_assert_held(&eb->refs_lock);
- WARN_ON(atomic_read(&eb->refs) == 0);
- if (atomic_dec_and_test(&eb->refs)) {
- if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
- struct btrfs_fs_info *fs_info = eb->fs_info;
+ if (refcount_dec_and_test(&eb->refs)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- spin_unlock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
- spin_lock(&fs_info->buffer_lock);
- radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> fs_info->sectorsize_bits);
- spin_unlock(&fs_info->buffer_lock);
- } else {
- spin_unlock(&eb->refs_lock);
- }
+ /*
+ * We're erasing, theoretically there will be no allocations, so
+ * just use GFP_ATOMIC.
+ *
+ * We use cmpxchg instead of erase because we do not know if
+ * this eb is actually in the tree or not, we could be cleaning
+ * up an eb that we allocated but never inserted into the tree.
+ * Thus use cmpxchg to remove it from the tree if it is there,
+ * or leave the other entry if this isn't in the tree.
+ *
+ * The documentation says that putting a NULL value is the same
+ * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
+ * in this case.
+ */
+ xa_cmpxchg_irq(&fs_info->buffer_tree,
+ eb->start >> fs_info->nodesize_bits, eb, NULL,
+ GFP_ATOMIC);
btrfs_leak_debug_del_eb(eb);
- /* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_pages(eb);
+ /* Should be safe to release folios at this point. */
+ btrfs_release_extent_buffer_folios(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
return 1;
}
#endif
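The comment above is the subtlety behind dropping EXTENT_BUFFER_IN_TREE: the release path no longer knows whether this eb was ever inserted, so it erases the slot only if it still points at this eb. A sketch of the xa_cmpxchg() semantics being relied on; the return-value check is illustrative, the patch ignores it:

static bool eb_erase_if_present(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	const unsigned long index = eb->start >> fs_info->nodesize_bits;
	void *old;

	/* Storing NULL erases the entry as long as XA_FLAGS_ALLOC is not set. */
	old = xa_cmpxchg_irq(&fs_info->buffer_tree, index, eb, NULL, GFP_ATOMIC);

	/* old == eb: it was in the tree and is now gone; otherwise the tree is untouched. */
	return old == eb;
}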
@@ -3241,22 +3636,26 @@ void free_extent_buffer(struct extent_buffer *eb)
if (!eb)
return;
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
while (1) {
- if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
- || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
- refs == 1))
+ if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
+ if (refs == 1)
+ break;
+ } else if (refs <= 3) {
break;
- if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+ }
+
+ /* Optimization to avoid locking eb->refs_lock. */
+ if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
return;
}
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) == 2 &&
+ if (refcount_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
/*
* I know this is terrible, but it's temporary until we stop tracking
@@ -3273,44 +3672,27 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
- if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty(struct folio *folio)
+static void btree_clear_folio_dirty_tag(struct folio *folio)
{
- ASSERT(folio_test_dirty(folio));
+ ASSERT(!folio_test_dirty(folio));
ASSERT(folio_test_locked(folio));
- folio_clear_dirty_for_io(folio);
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
- __xa_clear_mark(&folio->mapping->i_pages,
- folio_index(folio), PAGECACHE_TAG_DIRTY);
+ __xa_clear_mark(&folio->mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&folio->mapping->i_pages);
}
-static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct folio *folio = eb->folios[0];
- bool last;
-
- /* btree_clear_folio_dirty() needs page locked. */
- folio_lock(folio);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
- if (last)
- btree_clear_folio_dirty(folio);
- folio_unlock(folio);
- WARN_ON(atomic_read(&eb->refs) == 0);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios;
btrfs_assert_tree_write_locked(eb);
@@ -3334,129 +3716,99 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return clear_subpage_extent_buffer_dirty(eb);
-
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
+ bool last;
if (!folio_test_dirty(folio))
continue;
folio_lock(folio);
- btree_clear_folio_dirty(folio);
+ last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
+ if (last)
+ btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_folios = num_extent_folios(eb);
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/*
* For subpage case, we can have other extent buffers in the
- * same page, and in clear_subpage_extent_buffer_dirty() we
+ * same page, and in btrfs_clear_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
- * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * Thankfully, btrfs_clear_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
- lock_page(folio_page(eb->folios[0], 0));
- for (int i = 0; i < num_folios; i++)
- btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
- eb->start, eb->len);
+ folio_lock(eb->folios[0]);
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_dirty(eb->folios[i], eb);
+ buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY);
if (subpage)
- unlock_page(folio_page(eb->folios[0], 0));
+ folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
if (!folio)
continue;
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_clear_uptodate(folio);
- else
- btrfs_subpage_clear_uptodate(fs_info, folio,
- eb->start, eb->len);
+ btrfs_meta_folio_clear_uptodate(folio, eb);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
-
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_mark_uptodate(folio);
- else
- btrfs_subpage_set_uptodate(fs_info, folio,
- eb->start, eb->len);
- }
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
}
static void clear_extent_buffer_reading(struct extent_buffer *eb)
{
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);
}
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct folio_iter fi;
- u32 bio_offset = 0;
/*
* If the extent buffer is marked UPTODATE before the read operation
@@ -3471,25 +3823,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
uptodate = false;
- if (uptodate) {
+ if (uptodate)
set_extent_buffer_uptodate(eb);
- } else {
+ else
clear_extent_buffer_uptodate(eb);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- }
-
- bio_for_each_folio_all(fi, &bbio->bio) {
- struct folio *folio = fi.folio;
- u64 start = eb->start + bio_offset;
- u32 len = fi.length;
-
- if (uptodate)
- btrfs_folio_set_uptodate(fs_info, folio, start, len);
- else
- btrfs_folio_clear_uptodate(fs_info, folio, start, len);
-
- bio_offset += len;
- }
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
@@ -3497,11 +3834,11 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
bio_put(&bbio->bio);
}
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- const struct btrfs_tree_parent_check *check)
+int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_bio *bbio;
- bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3516,7 +3853,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
/* Someone else is already reading the buffer, just wait for it. */
if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
- goto done;
+ return 0;
/*
* Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
@@ -3529,41 +3866,40 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
return 0;
}
- clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
check_buffer_tree_ref(eb);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
- REQ_OP_READ | REQ_META, eb->fs_info,
- end_bbio_meta_read, eb);
+ REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode),
+ eb->start, end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
- bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
- bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
- if (eb->fs_info->nodesize < PAGE_SIZE) {
- ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
- eb->start - folio_pos(eb->folios[0]));
- ASSERT(ret);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_next_pos(folio),
+ eb->start + eb->len) - range_start;
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
}
btrfs_submit_bbio(bbio, mirror_num);
+ return 0;
+}
-done:
- if (wait == WAIT_COMPLETE) {
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return -EIO;
- }
+int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
+{
+ int ret;
+
+ ret = read_extent_buffer_pages_nowait(eb, mirror_num, check);
+ if (ret < 0)
+ return ret;
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
+ if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
+ return -EIO;
return 0;
}
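read_extent_buffer_pages() is now just submit-then-wait, so callers that only want to prime the cache (such as btrfs_readahead_tree_block() further down) can use the nowait half on its own. A sketch of how the two halves compose for a caller that kicks off two reads but only needs the first one right away (illustrative only):

static int read_now_and_prefetch(struct extent_buffer *needed,
				 struct extent_buffer *prefetch,
				 const struct btrfs_tree_parent_check *check)
{
	read_extent_buffer_pages_nowait(needed, 0, check);
	read_extent_buffer_pages_nowait(prefetch, 0, check);

	/* Waits on EXTENT_BUFFER_READING and then checks EXTENT_BUFFER_UPTODATE. */
	return read_extent_buffer_pages(needed, 0, check);
}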
@@ -3573,7 +3909,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
btrfs_warn(eb->fs_info,
"access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return true;
}
@@ -3735,7 +4071,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
- if (fs_info->nodesize < PAGE_SIZE) {
+ if (btrfs_meta_is_subpage(fs_info)) {
folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
@@ -3921,8 +4257,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long nr)
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
{
unsigned long i;
size_t offset;
@@ -4109,82 +4445,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
- const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
- struct extent_buffer *gang[GANG_LOOKUP_SIZE];
- struct extent_buffer *found = NULL;
- u64 folio_start = folio_pos(folio);
- u64 cur = folio_start;
-
- ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
- lockdep_assert_held(&fs_info->buffer_lock);
-
- while (cur < folio_start + PAGE_SIZE) {
- int ret;
- int i;
-
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
- (void **)gang, cur >> fs_info->sectorsize_bits,
- min_t(unsigned int, GANG_LOOKUP_SIZE,
- PAGE_SIZE / fs_info->nodesize));
- if (ret == 0)
- goto out;
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= folio_start + PAGE_SIZE)
- goto out;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- goto out;
- }
- }
- cur = gang[ret - 1]->start + gang[ret - 1]->len;
- }
-out:
- return found;
-}
-
static int try_release_subpage_extent_buffer(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- u64 cur = folio_pos(folio);
- const u64 end = cur + PAGE_SIZE;
+ struct extent_buffer *eb;
+ unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
+ unsigned long index = start;
+ unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
- while (cur < end) {
- struct extent_buffer *eb = NULL;
-
- /*
- * Unlike try_release_extent_buffer() which uses folio private
- * to grab buffer, for subpage case we rely on radix tree, thus
- * we need to ensure radix tree consistency.
- *
- * We also want an atomic snapshot of the radix tree, thus go
- * with spinlock rather than RCU.
- */
- spin_lock(&fs_info->buffer_lock);
- eb = get_next_extent_buffer(fs_info, folio, cur);
- if (!eb) {
- /* No more eb in the page range after or at cur */
- spin_unlock(&fs_info->buffer_lock);
- break;
- }
- cur = eb->start + eb->len;
-
+ rcu_read_lock();
+ xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ rcu_read_unlock();
+
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&fs_info->buffer_lock);
- break;
+ rcu_read_lock();
+ continue;
}
- spin_unlock(&fs_info->buffer_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a
@@ -4202,7 +4485,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
+ rcu_read_lock();
}
+ rcu_read_unlock();
+
/*
* Finally to check if we have cleared folio private, as if we have
* released all ebs in the page, the folio private should be cleared now.
@@ -4214,14 +4500,13 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
ret = 0;
spin_unlock(&folio->mapping->i_private_lock);
return ret;
-
}
int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return try_release_subpage_extent_buffer(folio);
/*
@@ -4243,7 +4528,7 @@ int try_release_extent_buffer(struct folio *folio)
* this page.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&folio->mapping->i_private_lock);
return 0;
@@ -4289,12 +4574,12 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb))
return;
- if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ if (btrfs_buffer_uptodate(eb, gen, true)) {
free_extent_buffer(eb);
return;
}
- ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
+ ret = read_extent_buffer_pages_nowait(eb, 0, &check);
if (ret < 0)
free_extent_buffer_stale(eb);
else
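
The subpage release path above now walks fs_info->buffer_tree (an xarray indexed by bytenr >> nodesize_bits) under rcu_read_lock() and takes eb->refs_lock per buffer, instead of scanning a radix tree under fs_info->buffer_lock. A lookup side that pairs with this kind of release typically looks like the sketch below; the helper name is hypothetical and it assumes extent buffers are only freed after an RCU grace period:

/* Hypothetical lookup-side sketch, not part of this patch. */
static struct extent_buffer *eb_lookup(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = xa_load(&fs_info->buffer_tree, bytenr >> fs_info->nodesize_bits);
	if (eb && !refcount_inc_not_zero(&eb->refs))
		eb = NULL;	/* Raced with release; treat as not found. */
	rcu_read_unlock();

	return eb;
}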
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8a36117ed453..02ebb2f238af 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -12,7 +12,6 @@
#include <linux/rwsem.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "compression.h"
#include "messages.h"
#include "ulist.h"
#include "misc.h"
@@ -38,16 +37,10 @@ struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
- EXTENT_BUFFER_CORRUPT,
- /* this got triggered by readahead */
- EXTENT_BUFFER_READAHEAD,
EXTENT_BUFFER_TREE_REF,
EXTENT_BUFFER_STALE,
EXTENT_BUFFER_WRITEBACK,
- /* read IO error */
- EXTENT_BUFFER_READ_ERR,
EXTENT_BUFFER_UNMAPPED,
- EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
/* Indicate the extent buffer is written zeroed out (for zoned) */
@@ -79,7 +72,7 @@ enum {
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
-#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
@@ -104,7 +97,7 @@ struct extent_buffer {
void *addr;
spinlock_t refs_lock;
- atomic_t refs;
+ refcount_t refs;
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
@@ -246,15 +239,13 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
-int set_page_extent_mapped(struct page *page);
void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
@@ -262,17 +253,23 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
-#define WAIT_NONE 0
-#define WAIT_COMPLETE 1
-#define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
const struct btrfs_tree_parent_check *parent_check);
-void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *parent_check);
+
+static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+ TASK_UNINTERRUPTIBLE);
+}
+
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
-static inline int num_extent_pages(const struct extent_buffer *eb)
+/* Note: this can be used in for loops without caching the value in a variable. */
+static inline int __pure num_extent_pages(const struct extent_buffer *eb)
{
/*
* For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
@@ -290,9 +287,13 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
* As we can have either one large folio covering the whole eb
* (either nodesize <= PAGE_SIZE, or high order folio), or multiple
* single-paged folios.
+ *
+ * Note: this can be used in for loops without caching the value in a variable.
*/
-static inline int num_extent_folios(const struct extent_buffer *eb)
+static inline int __pure num_extent_folios(const struct extent_buffer *eb)
{
+ if (!eb->folios[0])
+ return 0;
if (folio_order(eb->folios[0]))
return 1;
return num_extent_pages(eb);
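
As the new comments say, both helpers are __pure, so callers can put them straight into a loop condition without caching the value; an illustrative caller (not part of the patch):

/* Illustrative only. */
for (int i = 0; i < num_extent_folios(eb); i++) {
	struct folio *folio = eb->folios[i];

	/* ... per-folio work on the extent buffer ... */
}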
@@ -343,8 +344,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long pos);
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
@@ -364,7 +365,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
bool nofail);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 67ce85ff0ae2..7e38c23a0c1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@
static struct kmem_cache *extent_map_cache;
-int __init extent_map_init(void)
+int __init btrfs_extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0, 0, NULL);
@@ -22,7 +22,7 @@ int __init extent_map_init(void)
return 0;
}
-void __cold extent_map_exit(void)
+void __cold btrfs_extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
@@ -31,7 +31,7 @@ void __cold extent_map_exit(void)
* Initialize the extent tree @tree. Should be called for each new inode or
* other user of the extent_map interface.
*/
-void extent_map_tree_init(struct extent_map_tree *tree)
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree)
{
tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
@@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
* Allocate a new extent_map structure. The new structure is returned with a
* reference count of one and needs to be freed using free_extent_map()
*/
-struct extent_map *alloc_extent_map(void)
+struct extent_map *btrfs_alloc_extent_map(void)
{
struct extent_map *em;
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
@@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void)
* Drop the reference out on @em by one and free the structure if the reference
* count hits zero.
*/
-void free_extent_map(struct extent_map *em)
+void btrfs_free_extent_map(struct extent_map *em)
{
if (!em)
return;
if (refcount_dec_and_test(&em->refs)) {
- WARN_ON(extent_map_in_tree(em));
+ WARN_ON(btrfs_extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
@@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
rb_erase(&em->rb_node, &inode->extent_tree.root);
RB_CLEAR_NODE(&em->rb_node);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (em->start < entry->start)
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ else if (em->start >= btrfs_extent_map_end(entry))
p = &(*p)->rb_right;
else
return -EEXIST;
}
orig_parent = parent;
- while (parent && em->start >= extent_map_end(entry)) {
+ while (parent && em->start >= btrfs_extent_map_end(entry)) {
parent = rb_next(parent);
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
parent = orig_parent;
@@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
@@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
* Search through the tree for an extent_map with a given offset. If it can't
* be found, try to find some neighboring extents
*/
-static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_or_next_ret)
+static struct rb_node *tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
if (offset < entry->start)
n = n->rb_left;
- else if (offset >= extent_map_end(entry))
+ else if (offset >= btrfs_extent_map_end(entry))
n = n->rb_right;
else
return n;
}
orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
+ while (prev && offset >= btrfs_extent_map_end(prev_entry)) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
@@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
static inline u64 extent_map_block_len(const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_num_bytes;
return em->len;
}
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- const u64 block_start = extent_map_block_start(em);
+ const u64 block_start = btrfs_extent_map_block_start(em);
const u64 block_end = block_start + extent_map_block_len(em);
if (block_end < block_start)
@@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
return false;
/* Don't merge compressed extents, we need to know their actual size. */
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return false;
if (em->flags & EXTENT_FLAG_LOGGING)
@@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
- if (extent_map_end(prev) != next->start)
+ if (btrfs_extent_map_end(prev) != next->start)
return false;
/*
@@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
- return extent_map_block_start(next) == extent_map_block_end(prev);
+ return btrfs_extent_map_block_start(next) == extent_map_block_end(prev);
/* HOLES and INLINE extents. */
return next->disk_bytenr == prev->disk_bytenr;
@@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext
u64 new_offset;
/* @prev and @next should not be compressed. */
- ASSERT(!extent_map_is_compressed(prev));
- ASSERT(!extent_map_is_compressed(next));
+ ASSERT(!btrfs_extent_map_is_compressed(prev));
+ ASSERT(!btrfs_extent_map_is_compressed(next));
/*
* There are two different cases where @prev and @next can be merged.
@@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
if (em->offset + em->len > em->ram_bytes)
dump_extent_map(fs_info, "ram_bytes too small", em);
if (em->offset + em->len > em->disk_num_bytes &&
- !extent_map_is_compressed(em))
+ !btrfs_extent_map_is_compressed(em))
dump_extent_map(fs_info, "disk_num_bytes too small", em);
- if (!extent_map_is_compressed(em) &&
+ if (!btrfs_extent_map_is_compressed(em) &&
em->ram_bytes != em->disk_num_bytes)
dump_extent_map(fs_info,
"ram_bytes mismatch with disk_num_bytes for non-compressed em",
@@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->len += merge->len;
@@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
validate_extent_map(fs_info, em);
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
rb = rb_next(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
@@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
@@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
* -ENOENT when the extent is not found in the tree
* -EUCLEAN if the found extent does not match the expected start
*/
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
struct extent_map *em;
write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, start, len);
+ em = btrfs_lookup_extent_mapping(tree, start, len);
if (WARN_ON(!em)) {
btrfs_warn(fs_info,
@@ -444,23 +444,23 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
out:
write_unlock(&tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
lockdep_assert_held_write(&inode->extent_tree.lock);
em->flags &= ~EXTENT_FLAG_LOGGING;
- if (extent_map_in_tree(em))
+ if (btrfs_extent_map_in_tree(em))
try_merge_map(inode, em);
}
static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
- int modified)
+ bool modified)
{
refcount_inc(&em->refs);
@@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode,
* taken, or a reference dropped if the merge attempt was successful.
*/
static int add_extent_mapping(struct btrfs_inode *inode,
- struct extent_map *em, int modified)
+ struct extent_map *em, bool modified)
{
struct extent_map_tree *tree = &inode->extent_tree;
struct btrfs_root *root = inode->root;
@@ -502,22 +502,21 @@ static int add_extent_mapping(struct btrfs_inode *inode,
setup_extent_mapping(inode, em, modified);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root)))
percpu_counter_inc(&fs_info->evictable_extent_maps);
return 0;
}
-static struct extent_map *
-__lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, bool strict)
{
struct extent_map *em;
struct rb_node *rb_node;
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->root, start, &prev_or_next);
+ rb_node = tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
em = rb_entry(rb_node, struct extent_map, rb_node);
- if (strict && !(end > em->start && start < extent_map_end(em)))
+ if (strict && !(end > em->start && start < btrfs_extent_map_end(em)))
return NULL;
refcount_inc(&em->refs);
@@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
* intersect, so check the object returned carefully to make sure that no
* additional lookups are needed.
*/
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 1);
+ return lookup_extent_mapping(tree, start, len, true);
}
/*
@@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
*
* If one can't be found, any nearby extent may be returned
*/
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 0);
+ return lookup_extent_mapping(tree, start, len, false);
}
/*
@@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Remove @em from the extent tree of @inode. No reference counts are dropped,
* and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
struct extent_map_tree *tree = &inode->extent_tree;
@@ -595,7 +594,7 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
- int modified)
+ bool modified)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode,
validate_extent_map(fs_info, new);
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
- ASSERT(extent_map_in_tree(cur));
+ ASSERT(btrfs_extent_map_in_tree(cur));
if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
@@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
u64 end;
u64 start_diff;
- if (map_start < em->start || map_start >= extent_map_end(em))
+ if (map_start < em->start || map_start >= btrfs_extent_map_end(em))
return -EINVAL;
if (existing->start > map_start) {
@@ -662,16 +661,16 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
next = next_extent_map(prev);
}
- start = prev ? extent_map_end(prev) : em->start;
+ start = prev ? btrfs_extent_map_end(prev) : em->start;
start = max_t(u64, start, em->start);
- end = next ? next->start : extent_map_end(em);
- end = min_t(u64, end, extent_map_end(em));
+ end = next ? next->start : btrfs_extent_map_end(em);
+ end = min_t(u64, end, btrfs_extent_map_end(em));
start_diff = start - em->start;
em->start = start;
em->len = end - start;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
em->offset += start_diff;
- return add_extent_mapping(inode, em, 0);
+ return add_extent_mapping(inode, em, false);
}
/*
@@ -708,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(inode, em, 0);
+ ret = add_extent_mapping(inode, em, false);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (ret == -EEXIST) {
struct extent_map *existing;
- existing = search_extent_mapping(&inode->extent_tree, start, len);
+ existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* extent causing the -EEXIST.
*/
if (start >= existing->start &&
- start < extent_map_end(existing)) {
- free_extent_map(em);
+ start < btrfs_extent_map_end(existing)) {
+ btrfs_free_extent_map(em);
*em_in = existing;
ret = 0;
} else {
@@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
*/
ret = merge_extent_mapping(inode, existing, em, start);
if (WARN_ON(ret)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_in = NULL;
btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
- existing->start, extent_map_end(existing),
+ existing->start, btrfs_extent_map_end(existing),
orig_start, orig_start + orig_len, start);
}
- free_extent_map(existing);
+ btrfs_free_extent_map(existing);
}
}
@@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
- remove_extent_mapping(inode, em);
- free_extent_map(em);
+ btrfs_remove_extent_mapping(inode, em);
+ btrfs_free_extent_map(em);
if (cond_resched_rwlock_write(&tree->lock))
node = rb_first(&tree->root);
@@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
* range ends after our range (and they might be the same extent map),
* because we need to split those two extent maps at the boundaries.
*/
- split = alloc_extent_map();
- split2 = alloc_extent_map();
+ split = btrfs_alloc_extent_map();
+ split2 = btrfs_alloc_extent_map();
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
while (em) {
/* extent_map_end() returns exclusive value (last byte + 1). */
- const u64 em_end = extent_map_end(em);
+ const u64 em_end = btrfs_extent_map_end(em);
struct extent_map *next_em = NULL;
u64 gen;
unsigned long flags;
@@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
replace_extent_mapping(inode, em, split, modified);
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = split2;
split2 = NULL;
}
@@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->ram_bytes = split->len;
}
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
@@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (WARN_ON(ret != 0) && modified)
btrfs_set_inode_full_sync(inode);
}
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = NULL;
}
remove_em:
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
/*
* If the extent map is still in the tree it means that
* either of the following is true:
@@ -965,25 +964,25 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
}
/*
* Once for the tree reference (we replaced or removed the
* extent map from the tree).
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
/* Once for us (for our lookup reference). */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = next_em;
}
write_unlock(&em_tree->lock);
- free_extent_map(split);
- free_extent_map(split2);
+ btrfs_free_extent_map(split);
+ btrfs_free_extent_map(split2);
}
/*
@@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map_tree *tree = &inode->extent_tree;
int ret;
- ASSERT(!extent_map_in_tree(new_em));
+ ASSERT(!btrfs_extent_map_in_tree(new_em));
/*
* The caller has locked an appropriate file range in the inode's io
@@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
*
* This function is used when an ordered_extent needs to be split.
*/
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical)
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
ASSERT(pre != 0);
ASSERT(pre < len);
- split_pre = alloc_extent_map();
+ split_pre = btrfs_alloc_extent_map();
if (!split_pre)
return -ENOMEM;
- split_mid = alloc_extent_map();
+ split_mid = btrfs_alloc_extent_map();
if (!split_mid) {
ret = -ENOMEM;
goto out_free_pre;
}
- lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
+ if (unlikely(!em)) {
ret = -EIO;
goto out_unlock;
}
ASSERT(em->len == len);
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
ASSERT(em->flags & EXTENT_FLAG_PINNED);
ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
@@ -1083,7 +1082,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(inode, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, true);
/*
* Now we only have an extent_map at:
@@ -1093,25 +1092,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre;
split_mid->disk_num_bytes = split_mid->len;
split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(inode, split_mid, 1);
+ add_extent_mapping(inode, split_mid, true);
/* Once for us */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Once for the tree */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
- free_extent_map(split_mid);
+ btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_free_extent_map(split_mid);
out_free_pre:
- free_extent_map(split_pre);
+ btrfs_free_extent_map(split_pre);
return ret;
}
@@ -1128,6 +1127,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
long nr_dropped = 0;
struct rb_node *node;
+ lockdep_assert_held_write(&tree->lock);
+
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1140,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
- * We also do a try lock because otherwise we could deadlock. This is
- * because the shrinker for this filesystem may be invoked while we are
- * in a path that is holding the mmap lock in write mode. For example in
- * a reflink operation while COWing an extent buffer, when allocating
- * pages for a new extent buffer and under memory pressure, the shrinker
- * may be invoked, and therefore we would deadlock by attempting to read
- * lock the mmap lock while we are holding already a write lock on it.
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- /*
- * We want to be fast so if the lock is busy we don't want to spend time
- * waiting for it - either some task is about to do IO for the inode or
- * we may have another task shrinking extent maps, here in this code, so
- * skip this inode.
- */
- if (!write_trylock(&tree->lock)) {
- up_read(&inode->i_mmap_lock);
- return 0;
- }
-
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@@ -1182,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
btrfs_set_inode_full_sync(inode);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
trace_btrfs_extent_map_shrinker_remove_em(inode, em);
/* Drop the reference for the tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
nr_dropped++;
next:
if (ctx->scanned >= ctx->nr_to_scan)
@@ -1201,12 +1186,61 @@ next:
break;
node = next;
}
- write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,21 +1248,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
long nr_dropped = 0;
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
- btrfs_add_delayed_iput(inode);
+ iput(&inode->vfs_inode);
- if (ctx->scanned >= ctx->nr_to_scan ||
- btrfs_fs_closing(inode->root->fs_info))
+ if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
break;
cond_resched();
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {
@@ -1303,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
if (!root)
continue;
- if (is_fstree(btrfs_root_id(root)))
+ if (btrfs_is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
@@ -1338,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
- queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+ queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
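
With the extent map API renamed under the btrfs_ prefix throughout this file, the common caller pattern is unchanged: look the map up under the tree's rwlock and drop the reference that the lookup took. A small illustrative helper, not part of the patch:

/* Illustrative only: does any extent map overlap [start, start + len)? */
static bool range_has_extent_map(struct btrfs_inode *inode, u64 start, u64 len)
{
	struct extent_map *em;

	read_lock(&inode->extent_tree.lock);
	em = btrfs_lookup_extent_mapping(&inode->extent_tree, start, len);
	read_unlock(&inode->extent_tree.lock);

	if (!em)
		return false;

	/* The lookup grabbed a reference; drop it. */
	btrfs_free_extent_map(em);
	return true;
}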
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index cd123b266b64..6f685f3c9327 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -8,8 +8,7 @@
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
-#include "misc.h"
-#include "compression.h"
+#include "fs.h"
struct btrfs_inode;
struct btrfs_fs_info;
@@ -108,8 +107,8 @@ struct extent_map_tree {
struct btrfs_inode;
-static inline void extent_map_set_compression(struct extent_map *em,
- enum btrfs_compression_type type)
+static inline void btrfs_extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
{
if (type == BTRFS_COMPRESS_ZLIB)
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
@@ -119,7 +118,8 @@ static inline void extent_map_set_compression(struct extent_map *em,
em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
}
-static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+static inline enum btrfs_compression_type btrfs_extent_map_compression(
+ const struct extent_map *em)
{
if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
return BTRFS_COMPRESS_ZLIB;
@@ -137,50 +137,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex
* More efficient way to determine if extent is compressed, instead of using
* 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
*/
-static inline bool extent_map_is_compressed(const struct extent_map *em)
+static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em)
{
return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
EXTENT_FLAG_COMPRESS_LZO |
EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
}
-static inline int extent_map_in_tree(const struct extent_map *em)
+static inline int btrfs_extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_block_start(const struct extent_map *em)
+static inline u64 btrfs_extent_map_block_start(const struct extent_map *em)
{
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_bytenr;
return em->disk_bytenr + em->offset;
}
return em->disk_bytenr;
}
-static inline u64 extent_map_end(const struct extent_map *em)
+static inline u64 btrfs_extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
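
Together these two helpers give the usual translation from a file offset inside a map to its on-disk address; a small illustrative helper, not part of the patch and only meaningful for regular, non-compressed extents:

/* Illustrative only. */
static u64 em_file_offset_to_disk_bytenr(const struct extent_map *em, u64 file_offset)
{
	ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
	ASSERT(!btrfs_extent_map_is_compressed(em));
	ASSERT(file_offset >= em->start && file_offset < btrfs_extent_map_end(em));

	/* block_start already folds in em->offset for regular extents. */
	return btrfs_extent_map_block_start(em) + (file_offset - em->start);
}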
-void extent_map_tree_init(struct extent_map_tree *tree);
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical);
-
-struct extent_map *alloc_extent_map(void);
-void free_extent_map(struct extent_map *em);
-int __init extent_map_init(void);
-void __cold extent_map_exit(void);
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical);
+
+struct extent_map *btrfs_alloc_extent_map(void);
+void btrfs_free_extent_map(struct extent_map *em);
+int __init btrfs_extent_map_init(void);
+void __cold btrfs_extent_map_exit(void);
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index b80c07ad8c5e..f2eaaef8422b 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
if (cache_end > offset) {
if (offset == cache->offset) {
/*
- * We cached a dealloc range (found in the io tree) for
+ * We cached a delalloc range (found in the io tree) for
* a hole or prealloc extent and we have now found a
* file extent item for the same offset. What we have
* now is more recent and up to date, so discard what
@@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
* the cost of allocating a new one.
*/
ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
+ refcount_inc(&clone->refs);
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
@@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode,
const u64 ino = btrfs_ino(inode);
struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end = 0;
@@ -661,7 +661,7 @@ restart:
range_end = round_up(start + len, sectorsize);
prev_extent_end = range_start;
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -841,7 +841,7 @@ check_eof_delalloc:
}
out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
btrfs_release_path(path);
@@ -871,10 +871,9 @@ out_unlock:
ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
- free_extent_state(delalloc_cached_state);
+ btrfs_free_extent_state(delalloc_cached_state);
kfree(cache.entries);
btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
return ret;
}
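
The BTRFS_PATH_AUTO_FREE() conversion above relies on scope-based cleanup: the path is released automatically when the variable goes out of scope, so the explicit btrfs_free_path() call (and any label that existed only to reach it) can go away. A minimal sketch of the pattern, assuming the macro wraps the usual __free()-style cleanup:

/* Illustrative only, not part of this patch. */
static int example_tree_search(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Freed automatically on every return path below. */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}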
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 886749b39672..14e5257f0f04 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -18,6 +18,7 @@
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
+#include "volumes.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -46,7 +47,7 @@
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
u64 start, end, i_size;
- int ret;
+ bool found;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
@@ -55,9 +56,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
- &end, EXTENT_DIRTY);
- if (!ret && start == 0)
+ found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
+ if (found && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
@@ -91,8 +92,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
- EXTENT_DIRTY, NULL);
+ return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY, NULL);
}
/*
@@ -121,8 +122,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- return clear_extent_bit(inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, NULL);
+ return btrfs_clear_extent_bit(inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, NULL);
}
static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
@@ -163,20 +164,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_file_extent_item *item;
struct btrfs_key file_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
file_key.objectid = objectid;
- file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = pos;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -191,9 +193,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -214,8 +213,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -261,8 +260,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int cow = mod != 0;
file_key.objectid = objectid;
- file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = offset;
return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
@@ -338,23 +337,23 @@ out:
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
- blk_status_t ret = BLK_STS_OK;
+ int ret = 0;
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
- return BLK_STS_OK;
+ return 0;
/*
* This function is only called for read bio.
@@ -371,14 +370,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
- if (!bbio->csum) {
- btrfs_free_path(path);
- return BLK_STS_RESOURCE;
- }
+ bbio->csum = kvcalloc(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum)
+ return -ENOMEM;
} else {
bbio->csum = bbio->csum_inline;
}
@@ -397,8 +394,38 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
* between reading the free space cache and updating the csum tree.
*/
if (btrfs_is_free_space_inode(inode)) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ }
+
+ /*
+ * If we are searching for a csum of an extent from a past
+ * transaction, we can search in the commit root and reduce
+ * lock contention on the csum tree extent buffers.
+ *
+ * This is important because that lock is an rwsem which gets
+ * pretty heavy write load under memory pressure and sustained
+ * csum overwrites, unlike the commit_root_sem. (Memory pressure
+ * makes us write back the nodes multiple times per transaction,
+ * which makes us cow them each time, taking the write lock.)
+ *
+ * Due to how rwsem is implemented, there is a possible
+ * priority inversion where the readers holding the lock don't
+ * get scheduled (say they're in a cgroup stuck in heavy reclaim)
+ * which then blocks writers, including transaction commit. By
+ * using a semaphore with fewer writers (only a commit switching
+ * the roots), we make this issue less likely.
+ *
+ * Note that we don't rely on btrfs_search_slot() to lock the
+ * csum tree's commit root for us. We call search_slot multiple
+ * times, which would create a potential race where a commit comes
+ * in between searches while we are not holding the commit_root_sem,
+ * and we could end up with csums from across transactions.
+ */
+ if (bbio->csum_search_commit_root) {
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ down_read(&fs_info->commit_root_sem);
}
while (bio_offset < orig_len) {
@@ -410,9 +437,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
orig_len - bio_offset, csum_dst);
if (count < 0) {
- ret = errno_to_blk_status(count);
+ ret = count;
if (bbio->csum != bbio->csum_inline)
- kfree(bbio->csum);
+ kvfree(bbio->csum);
bbio->csum = NULL;
break;
}
@@ -431,12 +458,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(inode->root)) {
u64 file_offset = bbio->file_offset + bio_offset;
- set_extent_bit(&inode->io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, NULL);
} else {
btrfs_warn_rl(fs_info,
"csum hole found for disk bytenr range [%llu, %llu)",
@@ -446,7 +473,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
- btrfs_free_path(path);
+ if (bbio->csum_search_commit_root)
+ up_read(&fs_info->commit_root_sem);
return ret;
}
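
Condensed, the commit-root strategy described in the comment above is: flag the path to search the committed root without taking tree locks, and hold commit_root_sem across every search in the pass so all csums come from one committed csum tree. An illustrative sketch only; the real code loops over search_csum_tree(), which is not shown here:

/* Illustrative only: one commit_root_sem hold covers all searches in a pass. */
static int lookup_csum_committed(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path, u64 bytenr)
{
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
	struct btrfs_key key = {
		.objectid = BTRFS_EXTENT_CSUM_OBJECTID,
		.type = BTRFS_EXTENT_CSUM_KEY,
		.offset = bytenr,
	};
	int ret;

	path->search_commit_root = true;
	path->skip_locking = true;

	down_read(&fs_info->commit_root_sem);
	ret = btrfs_search_slot(NULL, csum_root, &key, path, 0, 0);
	/* ... copy csum items out of path->nodes[0] ... */
	up_read(&fs_info->commit_root_sem);

	return ret;
}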
@@ -486,8 +514,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
path->nowait = nowait;
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -737,23 +765,55 @@ fail:
return ret;
}
+static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src)
+{
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct bio *bio = &bbio->bio;
+ struct btrfs_ordered_sum *sums = bbio->sums;
+ struct bvec_iter iter = *src;
+ phys_addr_t paddr;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ u32 offset = 0;
+ int index = 0;
+
+ shash->tfm = fs_info->csum_shash;
+
+ btrfs_bio_for_each_block(paddr, bio, &iter, step) {
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
+
+ if (IS_ALIGNED(offset, blocksize)) {
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, sums->sums + index);
+ index += fs_info->csum_size;
+ }
+ }
+}
+
+static void csum_one_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, csum_work);
+
+ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+ ASSERT(bbio->async_csum == true);
+ csum_one_bio(bbio, &bbio->csum_saved_iter);
+ complete(&bbio->csum_done);
+}
+
/*
* Calculate checksums of the data contained inside a bio.
*/
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
+int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async)
{
struct btrfs_ordered_extent *ordered = bbio->ordered;
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
struct btrfs_ordered_sum *sums;
- char *data;
- struct bvec_iter iter;
- struct bio_vec bvec;
- int index;
- unsigned int blockcount;
- int i;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -762,35 +822,23 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
memalloc_nofs_restore(nofs_flag);
if (!sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
+ sums->logical = bbio->orig_logical;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
-
- sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- index = 0;
-
- shash->tfm = fs_info->csum_shash;
-
- bio_for_each_segment(bvec, bio, iter) {
- blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec.bv_len + fs_info->sectorsize
- - 1);
-
- for (i = 0; i < blockcount; i++) {
- data = bvec_kmap_local(&bvec);
- crypto_shash_digest(shash,
- data + (i * fs_info->sectorsize),
- fs_info->sectorsize,
- sums->sums + index);
- kunmap_local(data);
- index += fs_info->csum_size;
- }
-
- }
-
bbio->sums = sums;
btrfs_add_ordered_sum(ordered, sums);
+
+ if (!async) {
+ csum_one_bio(bbio, &bbio->bio.bi_iter);
+ return 0;
+ }
+ init_completion(&bbio->csum_done);
+ bbio->async_csum = true;
+ bbio->csum_saved_iter = bbio->bio.bi_iter;
+ INIT_WORK(&bbio->csum_work, csum_one_bio_work);
+ schedule_work(&bbio->csum_work);
return 0;
}
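
The async variant above queues csum_one_bio_work() and signals bbio->csum_done once the digests are in bbio->sums, so the submitter has to pair that with a wait before the write is finalized. A minimal sketch of that pairing; the waiting side is not part of this hunk, so where exactly it sits is an assumption:

/* Hypothetical waiter, not shown in this hunk. */
static void wait_for_async_csum(struct btrfs_bio *bbio)
{
	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);
}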
@@ -799,11 +847,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
* record the updated logical address on Zone Append completion.
* Allocate just the structure with an empty sums array here for that case.
*/
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
{
bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
if (!bbio->sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
bbio->sums->len = bbio->bio.bi_iter.bi_size;
bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_add_ordered_sum(bbio->ordered, bbio->sums);
@@ -876,7 +924,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
@@ -894,8 +942,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
while (1) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = end_byte - 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -998,7 +1046,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- if (ret && ret != -EAGAIN) {
+ if (unlikely(ret && ret != -EAGAIN)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1012,7 +1060,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -1054,7 +1101,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key file_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
@@ -1076,8 +1123,8 @@ again:
found_next = 0;
bytenr = sums->logical + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -1130,10 +1177,10 @@ again:
}
btrfs_release_path(path);
- path->search_for_extension = 1;
+ path->search_for_extension = true;
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
- path->search_for_extension = 0;
+ path->search_for_extension = false;
if (ret < 0)
goto out;
@@ -1259,14 +1306,12 @@ found:
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
goto again;
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -1304,7 +1349,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
/*
* Older kernels can create regular non-hole data
@@ -1324,7 +1369,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->start = 0;
em->len = fs_info->sectorsize;
em->offset = 0;
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 0e13661a71f3..5645c5e3abdb 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,9 +3,11 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/blk_types.h>
#include <linux/list.h>
#include <uapi/linux/btrfs_tree.h>
-#include "accessors.h"
+#include "ctree.h"
+#include "ordered-data.h"
struct extent_map;
struct btrfs_file_extent_item;
@@ -51,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 pos,
u64 num_bytes);
@@ -62,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
+int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async);
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
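
With btrfs_lookup_bio_sums(), btrfs_csum_one_bio() and btrfs_alloc_dummy_sum() now returning plain errnos, conversion to blk_status_t moves to the bio-layer boundary. A hypothetical caller sketch; the real submission path is not in this section:

/* Illustrative only. */
static void csum_and_submit(struct btrfs_bio *bbio)
{
	int ret = btrfs_csum_one_bio(bbio, false);

	if (ret) {
		/* Translate the errno only at the block-layer boundary. */
		btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
		return;
	}
	/* ... hand the bio off to the device layer ... */
}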
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 588c353d2969..7a501e73d880 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -36,52 +36,7 @@
#include "ioctl.h"
#include "file.h"
#include "super.h"
-
-/*
- * Helper to fault in page and copy. This should go away and be replaced with
- * calls into generic code.
- */
-static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
- struct folio *folio, struct iov_iter *i)
-{
- size_t copied = 0;
- size_t total_copied = 0;
- int offset = offset_in_page(pos);
-
- while (write_bytes > 0) {
- size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
- /*
- * Copy data from userspace to the current page
- */
- copied = copy_folio_from_iter_atomic(folio, offset, count, i);
-
- /* Flush processor's dcache for this page */
- flush_dcache_folio(folio);
-
- /*
- * if we get a partial write, we can end up with
- * partially up to date page. These add
- * a lot of complexity, so make sure they don't
- * happen by forcing this copy to be retried.
- *
- * The rest of the btrfs_file_write code will fall
- * back to page at a time copies after we return 0.
- */
- if (unlikely(copied < count)) {
- if (!folio_test_uptodate(folio)) {
- iov_iter_revert(i, copied);
- copied = 0;
- }
- if (!copied)
- break;
- }
-
- write_bytes -= copied;
- total_copied += copied;
- offset += copied;
- }
- return total_copied;
-}
+#include "print-tree.h"
/*
* Unlock folio after btrfs_file_write() is done with it.
@@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
}
/*
- * After btrfs_copy_from_user(), update the following things for delalloc:
+ * After copy_folio_from_iter_atomic(), update the following things for delalloc:
* - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup
@@ -120,7 +75,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
- u64 end_pos = pos + write_bytes;
+ const u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
@@ -131,11 +86,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
extra_bits |= EXTENT_NORESERVE;
start_pos = round_down(pos, fs_info->sectorsize);
- num_bytes = round_up(write_bytes + pos - start_pos,
- fs_info->sectorsize);
+ num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos &&
- folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos);
end_of_last_block = start_pos + num_bytes - 1;
@@ -143,9 +96,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ cached);
ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -224,7 +177,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (args->start >= inode->disk_i_size && !args->replace_extent)
+ if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
@@ -245,7 +198,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
ret = btrfs_next_leaf(root, path);
if (ret < 0)
break;
@@ -321,7 +278,11 @@ next_slot:
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end < extent_end) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@@ -351,7 +312,6 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
struct btrfs_ref ref = {
@@ -366,7 +326,7 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -397,7 +357,6 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += args->end - key.offset;
break;
@@ -409,7 +368,11 @@ next_slot:
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end >= extent_end) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@@ -417,7 +380,6 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += extent_end - args->start;
if (args->end == extent_end)
@@ -437,7 +399,11 @@ delete_extent_item:
del_slot = path->slots[0];
del_nr = 1;
} else {
- BUG_ON(del_slot + del_nr != path->slots[0]);
+ if (WARN_ON(del_slot + del_nr != path->slots[0])) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
del_nr++;
}
@@ -459,7 +425,7 @@ delete_extent_item:
key.offset - extent_offset,
0, false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -476,7 +442,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -540,20 +506,19 @@ out:
return ret;
}
-static int extent_mergeable(struct extent_buffer *leaf, int slot,
- u64 objectid, u64 bytenr, u64 orig_offset,
- u64 *start, u64 *end)
+static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
+ u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 extent_end;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
@@ -562,15 +527,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
- return 0;
+ return false;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
if ((*start && *start != key.offset) || (*end && *end != extent_end))
- return 0;
+ return false;
*start = key.offset;
*end = extent_end;
- return 1;
+ return true;
}
/*
@@ -585,7 +550,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
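For readers unfamiliar with the BTRFS_PATH_AUTO_FREE() declaration used above: the path is freed automatically when it goes out of scope, which is why the explicit btrfs_free_path() call disappears from the out label further down. A minimal sketch of the assumed usage pattern (example_lookup() is hypothetical, not part of the patch):

static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);	/* freed automatically on any return */

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* No btrfs_free_path() needed, even on early returns. */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}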
@@ -621,21 +586,20 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
+ if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (key.offset > start || extent_end < end) {
+ if (unlikely(key.offset > start || extent_end < end)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -668,7 +632,6 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -697,7 +660,6 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -712,7 +674,7 @@ again:
btrfs_release_path(path);
goto again;
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -731,7 +693,6 @@ again:
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
- btrfs_mark_buffer_dirty(trans, leaf);
ref.action = BTRFS_ADD_DELAYED_REF;
ref.bytenr = bytenr;
@@ -741,7 +702,7 @@ again:
ref.ref_root = btrfs_root_id(root);
btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -749,7 +710,7 @@ again:
if (split == start) {
key.offset = start;
} else {
- if (start != key.offset) {
+ if (unlikely(start != key.offset)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -781,7 +742,7 @@ again:
del_slot = path->slots[0] + 1;
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -799,7 +760,7 @@ again:
del_slot = path->slots[0];
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -810,7 +771,6 @@ again:
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
@@ -819,16 +779,14 @@ again:
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
- btrfs_mark_buffer_dirty(trans, leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -837,25 +795,25 @@ out:
* On success return a locked folio and 0
*/
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
- u64 len, bool force_uptodate)
+ u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
+ const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
if (folio_test_uptodate(folio))
return 0;
- if (!force_uptodate &&
- IS_ALIGNED(clamp_start, PAGE_SIZE) &&
- IS_ALIGNED(clamp_end, PAGE_SIZE))
+ if (IS_ALIGNED(clamp_start, blocksize) &&
+ IS_ALIGNED(clamp_end, blocksize))
return 0;
ret = btrfs_read_folio(NULL, folio);
if (ret)
return ret;
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
return -EIO;
}
@@ -894,32 +852,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
*/
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
loff_t pos, size_t write_bytes,
- bool force_uptodate, bool nowait)
+ bool nowait)
{
- unsigned long index = pos >> PAGE_SHIFT;
+ const pgoff_t index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
+ fgf_set_order(write_bytes);
struct folio *folio;
int ret = 0;
again:
folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
- if (IS_ERR(folio)) {
- if (nowait)
- ret = -EAGAIN;
- else
- ret = PTR_ERR(folio);
- return ret;
- }
- /* Only support page sized folio yet. */
- ASSERT(folio_order(folio) == 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
return ret;
}
- ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
if (ret) {
/* The folio is already unlocked. */
folio_put(folio);
@@ -960,14 +913,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
struct btrfs_ordered_extent *ordered;
if (nowait) {
- if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state)) {
+ if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
+ last_pos, cached_state)) {
folio_unlock(folio);
folio_put(folio);
return -EAGAIN;
}
} else {
- lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
}
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -975,8 +929,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -1006,6 +960,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* @pos: File offset.
* @write_bytes: The length to write, will be updated to the nocow writeable
* range.
+ * @nowait: True if we must not block (non-blocking IO context).
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
@@ -1013,8 +968,9 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* Return:
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
- * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
- * root is in progress.
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
+ * root is in progress or because we are in a non-blocking IO
+ * context and need to block (@nowait is true).
* < 0 If an error happened.
*
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
@@ -1026,8 +982,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
struct btrfs_root *root = inode->root;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
- u64 num_bytes;
- int ret;
+ u64 cur_offset;
+ int ret = 0;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
@@ -1038,7 +994,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- num_bytes = lockend - lockstart + 1;
if (nowait) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
@@ -1050,14 +1005,35 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, nowait, false);
- if (ret <= 0)
- btrfs_drew_write_unlock(&root->snapshot_lock);
- else
- *write_bytes = min_t(size_t, *write_bytes ,
- num_bytes - pos + lockstart);
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ cur_offset = lockstart;
+ while (cur_offset < lockend) {
+ u64 num_bytes = lockend - cur_offset + 1;
+
+ ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
+ if (ret <= 0) {
+ /*
+ * If cur_offset == lockstart it means we haven't found
+ * any extent against which we can NOCOW, so unlock the
+ * snapshot lock.
+ */
+ if (cur_offset == lockstart)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ break;
+ }
+ cur_offset += num_bytes;
+ }
+
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ /*
+ * cur_offset > lockstart means there's at least a partial range we can
+ * NOCOW, and that range can cover one or more extents.
+ */
+ if (cur_offset > lockstart) {
+ *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
+ return 1;
+ }
return ret;
}
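A hedged caller-side sketch of the contract documented above (example_nocow_write() is hypothetical): a return value > 0 means up to the clamped *write_bytes can be written without COW and the snapshot lock is held, so it must be paired with btrfs_check_nocow_unlock().

static ssize_t example_nocow_write(struct btrfs_inode *inode, loff_t pos, size_t len)
{
	size_t write_bytes = len;
	int ret;

	ret = btrfs_check_nocow_lock(inode, pos, &write_bytes, false);
	if (ret <= 0)
		return ret;	/* 0: must COW, < 0: error or -EAGAIN */

	/* ... write at most write_bytes bytes without reserving data space ... */

	btrfs_check_nocow_unlock(inode);
	return write_bytes;
}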
@@ -1075,7 +1051,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
- loff_t start_pos;
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
@@ -1102,9 +1077,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
inode_inc_iversion(inode);
}
- start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
+ if (pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
@@ -1116,218 +1090,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
return 0;
}
-ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
+static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
+ u64 start, u64 len, bool only_release_metadata)
{
- struct file *file = iocb->ki_filp;
- loff_t pos;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_changeset *data_reserved = NULL;
- u64 release_bytes = 0;
- u64 lockstart;
- u64 lockend;
- size_t num_written = 0;
- ssize_t ret;
- loff_t old_isize = i_size_read(inode);
- unsigned int ilock_flags = 0;
- const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- bool only_release_metadata = false;
+ if (len == 0)
+ return;
- if (nowait)
- ilock_flags |= BTRFS_ILOCK_TRY;
+ if (only_release_metadata) {
+ btrfs_check_nocow_unlock(inode);
+ btrfs_delalloc_release_metadata(inode, len, true);
+ } else {
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
- ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (ret < 0)
- return ret;
+ btrfs_delalloc_release_space(inode, data_reserved,
+ round_down(start, fs_info->sectorsize),
+ len, true);
+ }
+}
- ret = generic_write_checks(iocb, i);
- if (ret <= 0)
- goto out;
+/*
+ * Reserve data and metadata space for this buffered write range.
+ *
+ * Return >0 for the number of bytes reserved, which is always block aligned.
+ * Return <0 for error.
+ */
+static ssize_t reserve_space(struct btrfs_inode *inode,
+ struct extent_changeset **data_reserved,
+ u64 start, size_t *len, bool nowait,
+ bool *only_release_metadata)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
+ size_t reserve_bytes;
+ int ret;
- ret = btrfs_write_check(iocb, ret);
- if (ret < 0)
- goto out;
+ ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
+ if (ret < 0) {
+ int can_nocow;
- pos = iocb->ki_pos;
- while (iov_iter_count(i) > 0) {
- struct extent_state *cached_state = NULL;
- size_t offset = offset_in_page(pos);
- size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
- size_t reserve_bytes;
- size_t copied;
- size_t dirty_sectors;
- size_t num_sectors;
- struct folio *folio = NULL;
- int extents_locked;
- bool force_page_uptodate = false;
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
+ return -EAGAIN;
/*
- * Fault pages before locking them in prepare_one_folio()
- * to avoid recursive lock
+ * If we don't have to COW at the offset, reserve metadata only.
+ * The length (*len) may get smaller than requested here.
*/
- if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
- ret = -EFAULT;
- break;
- }
+ can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
+ return ret;
+ *only_release_metadata = true;
+ }
- only_release_metadata = false;
- sector_offset = pos & (fs_info->sectorsize - 1);
+ reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
+ WARN_ON(reserve_bytes == 0);
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
+ reserve_bytes, nowait);
+ if (ret) {
+ if (!*only_release_metadata)
+ btrfs_free_reserved_data_space(inode, *data_reserved,
+ start, *len);
+ else
+ btrfs_check_nocow_unlock(inode);
- extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &data_reserved, pos,
- write_bytes, nowait);
- if (ret < 0) {
- int can_nocow;
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
+ return ret;
+ }
+ return reserve_bytes;
+}
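Worked example of the rounding above (illustrative numbers, 4 KiB sectorsize assumed): for start = 5000 and *len = 3000, block_offset = 5000 & 4095 = 904, so reserve_bytes = round_up(3000 + 904, 4096) = 4096, i.e. the single block [4096, 8192) touched by the unaligned write, and 4096 is returned.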
- if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
- ret = -EAGAIN;
- break;
- }
+/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
+static void shrink_reserved_space(struct btrfs_inode *inode,
+ struct extent_changeset *data_reserved,
+ u64 reserved_start, u64 reserved_len,
+ u64 new_len, bool only_release_metadata)
+{
+ const u64 diff = reserved_len - new_len;
- /*
- * If we don't have to COW at the offset, reserve
- * metadata only. write_bytes may get smaller than
- * requested here.
- */
- can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes, nowait);
- if (can_nocow < 0)
- ret = can_nocow;
- if (can_nocow > 0)
- ret = 0;
- if (ret)
- break;
- only_release_metadata = true;
- }
+ ASSERT(new_len <= reserved_len);
+ btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, diff, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ reserved_start + new_len, diff, true);
+}
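Worked example (illustrative numbers, 4 KiB blocks): if 8192 bytes were reserved at reserved_start but only the first block ended up dirty, the caller passes new_len = 4096 and the helper releases the trailing diff = 4096 bytes: metadata only in the NOCOW case, otherwise data plus metadata starting at reserved_start + 4096.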
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
- WARN_ON(reserve_bytes == 0);
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes,
- reserve_bytes, nowait);
- if (ret) {
- if (!only_release_metadata)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, pos,
- write_bytes);
- else
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+/* Calculate the maximum amount of bytes we can write into one folio. */
+static size_t calc_write_bytes(const struct btrfs_inode *inode,
+ const struct iov_iter *iter, u64 start)
+{
+ const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
- if (nowait && ret == -ENOSPC)
- ret = -EAGAIN;
- break;
- }
+ return min(max_folio_size - (start & (max_folio_size - 1)),
+ iov_iter_count(iter));
+}
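Worked example (illustrative numbers): with a 64 KiB max folio size and start = 100 KiB + 512, start & (max_folio_size - 1) is 36 KiB + 512, so at most 64 KiB - (36 KiB + 512) = 28 KiB - 512 bytes fit before the folio boundary; the result is further capped by iov_iter_count(iter).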
+
+/*
+ * Do the heavy-lifting work to copy one range into one folio of the page cache.
+ *
+ * Return > 0 in case we copied all bytes or just some of them.
+ * Return 0 if no bytes were copied, in which case the caller should retry.
+ * Return <0 on error.
+ */
+static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
+ struct extent_changeset **data_reserved, u64 start,
+ bool nowait)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_state *cached_state = NULL;
+ size_t write_bytes = calc_write_bytes(inode, iter, start);
+ size_t copied;
+ const u64 reserved_start = round_down(start, fs_info->sectorsize);
+ u64 reserved_len;
+ struct folio *folio = NULL;
+ int extents_locked;
+ u64 lockstart;
+ u64 lockend;
+ bool only_release_metadata = false;
+ const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ int ret;
+
+ /*
+ * Fault all pages before locking them in prepare_one_folio() to avoid
+ * recursive lock.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
+ return -EFAULT;
+ extent_changeset_release(*data_reserved);
+ ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
+ &only_release_metadata);
+ if (ret < 0)
+ return ret;
+ reserved_len = ret;
+ /* Write range must be inside the reserved range. */
+ ASSERT(reserved_start <= start);
+ ASSERT(start + write_bytes <= reserved_start + reserved_len);
- release_bytes = reserve_bytes;
again:
- ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- break;
- }
+ ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
+ bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- ret = prepare_one_folio(inode, &folio, pos, write_bytes,
- force_page_uptodate, false);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- break;
- }
+ ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
- folio, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
- if (extents_locked < 0) {
- if (!nowait && extents_locked == -EAGAIN)
- goto again;
+ /*
+ * The reserved range goes beyond the current folio, shrink the reserved
+ * space to the folio boundary.
+ */
+ if (reserved_start + reserved_len > folio_next_pos(folio)) {
+ const u64 last_block = folio_next_pos(folio);
+
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ write_bytes = last_block - start;
+ reserved_len = last_block - reserved_start;
+ }
+
+ extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
+ write_bytes, &lockstart,
+ &lockend, nowait,
+ &cached_state);
+ if (extents_locked < 0) {
+ if (!nowait && extents_locked == -EAGAIN)
+ goto again;
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- ret = extents_locked;
- break;
- }
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ ret = extents_locked;
+ return ret;
+ }
- copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
+ copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
+ write_bytes, iter);
+ flush_dcache_folio(folio);
- num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
- dirty_sectors = round_up(copied + sector_offset,
- fs_info->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
+ if (unlikely(copied < write_bytes)) {
+ u64 last_block;
- if (copied == 0) {
- force_page_uptodate = true;
- dirty_sectors = 0;
- } else {
- force_page_uptodate = false;
+ /*
+ * The original write range doesn't need an uptodate folio as
+ * the range is block aligned. But now a short copy happened.
+ * We cannot handle it without an uptodate folio.
+ *
+ * So just revert the range and we will retry.
+ */
+ if (!folio_test_uptodate(folio)) {
+ iov_iter_revert(iter, copied);
+ copied = 0;
}
- if (num_sectors > dirty_sectors) {
- /* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
- if (only_release_metadata) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- u64 release_start = round_up(pos + copied,
- fs_info->sectorsize);
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, release_start,
- release_bytes, true);
- }
+ /* No copied bytes, unlock, release reserved space and exit. */
+ if (copied == 0) {
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return 0;
}
- release_bytes = round_up(copied + sector_offset,
- fs_info->sectorsize);
+ /* Release the reserved space beyond the last block. */
+ last_block = round_up(start + copied, fs_info->sectorsize);
- ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
- &cached_state, only_release_metadata);
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ reserved_len = last_block - reserved_start;
+ }
- /*
- * If we have not locked the extent range, because the range's
- * start offset is >= i_size, we might still have a non-NULL
- * cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_page(). Therefore free any
- * possible cached extent state to avoid a memory leak.
- */
- if (extents_locked)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
- else
- free_extent_state(cached_state);
+ ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
+ only_release_metadata);
+ /*
+ * If we have not locked the extent range, because the range's start
+ * offset is >= i_size, we might still have a non-NULL cached extent
+ * state, acquired while marking the extent range as delalloc through
+ * btrfs_dirty_folio(). Therefore free any possible cached extent state
+ * to avoid a memory leak.
+ */
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- if (ret) {
- btrfs_drop_folio(fs_info, folio, pos, copied);
- break;
- }
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ if (ret) {
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
- release_bytes = 0;
- if (only_release_metadata)
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return copied;
+}
- btrfs_drop_folio(fs_info, folio, pos, copied);
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ struct inode *inode = file_inode(file);
+ struct extent_changeset *data_reserved = NULL;
+ size_t num_written = 0;
+ ssize_t ret;
+ loff_t old_isize;
+ unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- cond_resched();
+ if (nowait)
+ ilock_flags |= BTRFS_ILOCK_TRY;
- pos += copied;
- num_written += copied;
- }
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
- if (release_bytes) {
- if (only_release_metadata) {
- btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved,
- round_down(pos, fs_info->sectorsize),
- release_bytes, true);
- }
+ /*
+ * We can only trust the isize with the inode lock held, otherwise it can
+ * race with other buffered writes and lead to an incorrect call to
+ * pagecache_isize_extended() that overwrites existing data.
+ */
+ old_isize = i_size_read(inode);
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
+ while (iov_iter_count(iter) > 0) {
+ ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
+ if (ret < 0)
+ break;
+ pos += ret;
+ num_written += ret;
+ cond_resched();
}
extent_changeset_free(data_reserved);
@@ -1378,6 +1440,8 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
+ if (unlikely(btrfs_is_shutdown(inode->root->fs_info)))
+ return -EIO;
/*
* If the fs flips readonly due to some impossible error, although we
* have opened a file as writable, we have to stop this write operation
@@ -1422,7 +1486,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
if (private) {
kfree(private->filldir_buf);
- free_extent_state(private->llseek_cached_state);
+ btrfs_free_extent_state(private->llseek_cached_state);
kfree(private);
filp->private_data = NULL;
}
@@ -1790,27 +1854,25 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
+ size_t fsize = folio_size(folio);
+ int ret;
+ bool only_release_metadata = false;
u64 reserved_space;
u64 page_start;
u64 page_end;
u64 end;
- ASSERT(folio_order(folio) == 0);
+ reserved_space = fsize;
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(inode->vfs_inode.i_sb);
page_start = folio_pos(folio);
page_end = page_start + folio_size(folio) - 1;
end = page_end;
@@ -1823,38 +1885,53 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
+ ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
+ reserved_space, false);
+ if (ret < 0) {
+ size_t write_bytes = reserved_space;
+
+ if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
+ goto out_noreserve;
+
+ only_release_metadata = true;
+
+ /*
+ * Can't write the whole range, there may be shared extents or
+ * holes in the range, bail out with @only_release_metadata set
+ * to true so that we unlock the nocow lock before returning the
+ * error.
+ */
+ if (write_bytes < reserved_space)
+ goto out_noreserve;
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
+ reserved_space, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ page_start, reserved_space);
goto out_noreserve;
}
- /* Make the VM retry the fault. */
- ret = VM_FAULT_NOPAGE;
+ ret = file_update_time(vmf->vma->vm_file);
+ if (ret < 0)
+ goto out;
again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
+ down_read(&inode->i_mmap_lock);
folio_lock(folio);
- size = i_size_read(inode);
+ size = i_size_read(&inode->vfs_inode);
- if ((folio->mapping != inode->i_mapping) ||
+ if ((folio->mapping != inode->vfs_inode.i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
}
folio_wait_writeback(folio);
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_folio_extent_mapped(folio);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1862,23 +1939,27 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
- if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
+ if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
+ if (reserved_space < fsize) {
+ const u64 to_free = fsize - reserved_space;
+
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, to_free, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ end + 1, to_free, true);
}
}
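Worked example of the shrink above (illustrative numbers, assuming a 4 KiB sectorsize and a 16 KiB folio): with page_start = 0 and i_size = 10000, reserved_space becomes round_up(10000, 4096) = 12288, end = 12287, and to_free = 16384 - 12288 = 4096 is given back, metadata only on the NOCOW path, otherwise data plus metadata for the range starting at end + 1.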
@@ -1889,15 +1970,13 @@ again:
* clear any delalloc bits within this page range since we have to
* reserve data&meta space before lock_page() (see above comments).
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
+ btrfs_clear_extent_bit(io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1905,36 +1984,53 @@ again:
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
- zero_start = PAGE_SIZE;
+ zero_start = fsize;
- if (zero_start != PAGE_SIZE)
+ if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+ btrfs_set_inode_last_sub_trans(inode);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ if (only_release_metadata)
+ btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
+ &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&inode->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+ sb_end_pagefault(inode->vfs_inode.i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
out_unlock:
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, reserved_space, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space, true);
extent_changeset_free(data_reserved);
- return ret;
+out_noreserve:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+
+ sb_end_pagefault(inode->vfs_inode.i_sb);
+
+ if (ret < 0)
+ return vmf_error(ret);
+
+ /* Make the VM retry the fault. */
+ return VM_FAULT_NOPAGE;
}
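A compressed sketch of the reservation order btrfs_page_mkwrite() now follows, paraphrasing the hunks above rather than adding new patch code: data space is tried first, and a whole-folio NOCOW (metadata-only) reservation is the fallback.

	ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, fsize, false);
	if (ret < 0) {
		size_t write_bytes = fsize;

		if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
			goto out_noreserve;
		only_release_metadata = true;
		/* The whole folio must be NOCOW-able, otherwise bail out. */
		if (write_bytes < fsize)
			goto out_noreserve;
	}
	ret = btrfs_delalloc_reserve_metadata(inode, fsize, fsize, false);
	/* ... on success the NOCOW case later marks the range EXTENT_NORESERVE ... */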
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -1943,46 +2039,49 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
.page_mkwrite = btrfs_page_mkwrite,
};
-static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *filp = desc->file;
struct address_space *mapping = filp->f_mapping;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))))
+ return -EIO;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
- vma->vm_ops = &btrfs_file_vm_ops;
+ desc->vm_ops = &btrfs_file_vm_ops;
return 0;
}
-static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
- int slot, u64 start, u64 end)
+static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
- return 0;
+ return false;
if (btrfs_file_extent_disk_bytenr(leaf, fi))
- return 0;
+ return false;
if (key.offset == end)
- return 1;
+ return true;
if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
- return 1;
- return 0;
+ return true;
+ return false;
}
static int fill_holes(struct btrfs_trans_handle *trans,
@@ -2028,7 +2127,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
@@ -2045,7 +2143,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
btrfs_release_path(path);
@@ -2058,7 +2155,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
@@ -2072,7 +2169,7 @@ out:
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
}
@@ -2105,15 +2202,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
0 : *start + *len - em->start - em->len;
*start = em->start + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-static void btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+/*
+ * Check if there is no folio in the range.
+ *
+ * We cannot utilize filemap_range_has_page() in a filemap with large folios
+ * as we can hit the following false positive:
+ *
+ *         start                          end
+ *         |                              |
+ * |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
+ *  \          /                        \     /
+ *   Folio A                             Folio B
+ *
+ * Large folios A and B cover the start and end indexes.
+ * In that case filemap_range_has_page() will always return true, but the above
+ * case is fine for btrfs_punch_hole_lock_range() usage.
+ *
+ * So here we only ensure that no other folio is in the range, excluding
+ * the head/tail large folios.
+ */
+static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
{
+ struct folio_batch fbatch;
+ bool ret = false;
/*
* For subpage case, if the range is not at page boundary, we could
* have pages at the leading/tailing part of the range.
@@ -2121,15 +2236,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
*/
- const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockstart = round_up(start, PAGE_SIZE);
+ const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
+ const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
+ const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
+ pgoff_t tmp = start_index;
+ int found_folios;
+
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ return false;
+
+ folio_batch_init(&fbatch);
+ found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
+ for (int i = 0; i < found_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ /* A large folio begins before the start. Not a target. */
+ if (folio->index < start_index)
+ continue;
+ /* A large folio extends beyond the end. Not a target. */
+ if (folio_next_index(folio) > end_index)
+ continue;
+ /* A folio doesn't cover the head/tail index. Found a target. */
+ ret = true;
+ break;
+ }
+ folio_batch_release(&fbatch);
+ return ret;
+}
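Worked example of the early return above (illustrative numbers, 4 KiB pages): punching start = 5000, end = 7000 gives page_lockstart = round_up(5000, 4096) = 8192 and page_lockend = round_down(7001, 4096) = 4096; page_lockend <= page_lockstart, so the range sits within one page (or two adjacent ones) and the helper returns false without scanning the page cache.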
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart, const u64 lockend,
+ struct extent_state **cached_state)
+{
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2140,12 +2288,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* locking the range check if we have pages in the range, and if
* we do, unlock the range and retry.
*/
- if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ if (!check_range_has_page(inode, lockstart, lockend))
break;
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -2193,7 +2340,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
@@ -2259,7 +2405,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
unsigned int rsv_count;
u64 cur_offset;
u64 len = end - start;
@@ -2268,13 +2414,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (end <= start)
return -EINVAL;
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out;
- }
- rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv.failfast = true;
/*
* 1 - update the inode
@@ -2291,14 +2433,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_free;
+ goto out_release;
}
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
if (WARN_ON(ret))
goto out_trans;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = start;
drop_args.path = path;
@@ -2320,9 +2462,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* got EOPNOTSUPP via prealloc then we messed up and
* need to abort.
*/
- if (ret &&
- (ret != -EOPNOTSUPP ||
- (extent_info && extent_info->is_new_extent)))
+ if (unlikely(ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent))))
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2333,7 +2475,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/*
* If we failed then we didn't insert our hole
* entries for the area we dropped, so now the
@@ -2353,7 +2495,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
@@ -2372,7 +2514,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2414,10 +2556,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = drop_args.drop_end;
len = end - cur_offset;
@@ -2467,7 +2609,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2476,7 +2618,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2486,7 +2628,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2494,16 +2636,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
out_trans:
if (!trans)
- goto out_free;
+ goto out_release;
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret)
btrfs_end_transaction(trans);
else
*trans_out = trans;
-out_free:
- btrfs_free_block_rsv(fs_info, rsv);
-out:
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
return ret;
}
@@ -2519,7 +2660,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
u64 lockend;
u64 tail_start;
u64 tail_len;
- u64 orig_start = offset;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
int ret = 0;
bool same_block;
u64 ino_size;
@@ -2551,18 +2693,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
- * We needn't truncate any block which is beyond the end of the file
- * because we are sure there is no data there.
- */
- /*
* Only do this if we are in the same block and we aren't doing the
* entire block.
*/
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
} else {
ret = 0;
}
@@ -2572,7 +2710,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
if (ret) {
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
@@ -2609,8 +2747,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
if (tail_start + tail_len < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode),
- tail_start + tail_len,
- 0, 1);
+ tail_start + tail_len - 1,
+ orig_start, orig_end);
if (ret)
goto out_only_mutex;
}
@@ -2644,8 +2782,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -2719,12 +2857,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 range_start;
+ u64 range_end;
int ret;
int ret2;
if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
return 0;
+ range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
+ range_end = round_up(end, root->fs_info->sectorsize);
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
+ range_end - range_start);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2763,7 +2911,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2778,6 +2926,8 @@ static int btrfs_zero_range(struct inode *inode,
int ret;
u64 alloc_hint = 0;
const u64 sectorsize = fs_info->sectorsize;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -2807,7 +2957,7 @@ static int btrfs_zero_range(struct inode *inode,
* do nothing except updating the inode's i_size if
* needed.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
@@ -2820,9 +2970,9 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = extent_map_block_start(em) + em->len;
+ alloc_hint = btrfs_extent_map_block_start(em) + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
@@ -2833,22 +2983,22 @@ static int btrfs_zero_range(struct inode *inode,
}
if (em->flags & EXTENT_FLAG_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
- free_extent_map(em);
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ btrfs_free_extent_map(em);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
mode);
return ret;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
alloc_start = round_down(offset, sectorsize);
alloc_end = alloc_start + sectorsize;
goto reserve_space;
@@ -2872,7 +3022,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2889,8 +3040,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
- 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2915,16 +3066,16 @@ reserve_space:
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
fs_info->sectorsize,
offset + len, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -2964,6 +3115,9 @@ static long btrfs_fallocate(struct file *file, int mode,
int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ return -EIO;
+
/* Do not allow fallocate in ZONED mode */
if (btrfs_is_zoned(inode_to_fs_info(inode)))
return -EOPNOTSUPP;
@@ -3010,7 +3164,8 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
+ inode->i_size, (u64)-1);
if (ret)
goto out;
}
@@ -3035,8 +3190,8 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
@@ -3048,8 +3203,8 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = PTR_ERR(em);
break;
}
- last_byte = min(extent_map_end(em), alloc_end);
- actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = min(btrfs_extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -3058,19 +3213,19 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
qgroup_reserved += range_len;
data_space_needed += range_len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
cur_offset = last_byte;
}
@@ -3124,8 +3279,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
@@ -3159,10 +3314,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
if (inode->delalloc_bytes > 0) {
spin_unlock(&inode->lock);
*delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree,
- delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1,
- cached_state);
+ delalloc_len = btrfs_count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
} else {
spin_unlock(&inode->lock);
}
@@ -3205,7 +3360,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* We could also use the extent map tree to find such delalloc that is
* being flushed, but using the ordered extents tree is more efficient
* because it's usually much smaller as ordered extents are removed from
- * the tree once they complete. With the extent maps, we mau have them
+ * the tree once they complete. With the extent maps, we may have them
* in the extent map tree for a very long time, and they were either
* created by previous writes or loaded by read operations.
*/
@@ -3471,7 +3626,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
last_extent_end = lockstart;
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
@@ -3617,7 +3772,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
}
out:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
btrfs_free_path(path);
if (ret < 0)
@@ -3654,6 +3809,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ return -EIO;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
@@ -3666,6 +3824,9 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))))
+ return -EIO;
+
if (iocb->ki_flags & IOCB_DIRECT) {
ret = btrfs_direct_read(iocb, to);
if (ret < 0 || !iov_iter_count(to) ||
@@ -3676,13 +3837,23 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return filemap_read(iocb, to, ret);
}
+static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))))
+ return -EIO;
+
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
.read_iter = btrfs_file_read_iter,
- .splice_read = filemap_splice_read,
+ .splice_read = btrfs_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
- .mmap = btrfs_file_mmap,
+ .mmap_prepare = btrfs_file_mmap_prepare,
.open = btrfs_file_open,
.release = btrfs_release_file,
.get_unmapped_area = thp_get_unmapped_area,
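The file.c hunks above consistently add an early shutdown check to each entry point (open, read_iter, splice_read, fallocate) and introduce a thin wrapper so the check also covers the splice path. Below is a minimal userspace sketch of that guard-wrapper pattern; the fs_info/file types and fs_is_shutdown() helper are stand-ins for illustration only, not the btrfs kernel API.

	/*
	 * Guard-wrapper sketch: every entry point checks a shutdown flag
	 * first and fails with -EIO before doing any work.
	 */
	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/types.h>

	struct fs_info {
		bool shutdown;		/* set once the filesystem is torn down */
	};

	struct file {
		struct fs_info *fs_info;
		const char *data;	/* backing contents, demo only */
	};

	static bool fs_is_shutdown(const struct fs_info *fs)
	{
		return fs->shutdown;
	}

	/* Wrapper: refuse new reads once the filesystem is shut down. */
	static ssize_t file_read(struct file *f, char *buf, size_t len)
	{
		size_t n;

		if (fs_is_shutdown(f->fs_info))
			return -EIO;

		n = strlen(f->data);
		if (n > len)
			n = len;
		memcpy(buf, f->data, n);
		return (ssize_t)n;
	}

	int main(void)
	{
		struct fs_info fs = { .shutdown = false };
		struct file f = { .fs_info = &fs, .data = "hello" };
		char buf[16];

		printf("read: %zd\n", file_read(&f, buf, sizeof(buf)));
		fs.shutdown = true;
		printf("read after shutdown: %zd\n", file_read(&f, buf, sizeof(buf)));
		return 0;
	}

Keeping the check in a dedicated wrapper (as done for splice_read above) means the generic helper such as filemap_splice_read stays untouched while every btrfs entry point fails fast with the same error.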
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index de89e644be29..d7df81388cbe 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -9,6 +9,8 @@ struct file;
struct extent_state;
struct kiocb;
struct iov_iter;
+struct inode;
+struct folio;
struct page;
struct btrfs_ioctl_encoded_io_args;
struct btrfs_drop_extents_args;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfa52ef40b06..f0f72850fab2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -12,7 +12,7 @@
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
-#include "ctree.h"
+#include "extent-tree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode;
unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
- mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_constraint(inode->i_mapping,
+ mapping_set_gfp_mask(inode->vfs_inode.i_mapping,
+ mapping_gfp_constraint(inode->vfs_inode.i_mapping,
~(__GFP_FS | __GFP_HIGHMEM)));
- return inode;
+ return &inode->vfs_inode;
}
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -198,12 +198,11 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
@@ -216,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -246,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -259,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
if (PTR_ERR(inode) != -ENOENT)
ret = PTR_ERR(inode);
- goto out;
+ return ret;
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(BTRFS_I(inode));
- goto out;
+ return ret;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -287,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = 0;
- goto out;
+ return ret;
}
- ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, trans->fs_info->tree_root, path);
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
@@ -313,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
bool locked = false;
if (block_group) {
- struct btrfs_path *path = btrfs_alloc_path();
+ BTRFS_PATH_AUTO_FREE(path);
+ path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto fail;
@@ -335,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_CLEAR;
spin_unlock(&block_group->lock);
- btrfs_free_path(path);
}
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
@@ -353,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -371,7 +366,7 @@ fail:
static void readahead_cache(struct inode *inode)
{
struct file_ra_state ra;
- unsigned long last_index;
+ pgoff_t last_index;
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -449,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
- struct page *page;
+ struct folio *folio;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -457,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
for (i = 0; i < io_ctl->num_pages; i++) {
int ret;
- page = find_or_create_page(inode->i_mapping, i, mask);
- if (!page) {
+ folio = __filemap_get_folio(inode->i_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
+ if (IS_ERR(folio)) {
io_ctl_drop_pages(io_ctl);
- return -ENOMEM;
+ return PTR_ERR(folio);
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
io_ctl_drop_pages(io_ctl);
return ret;
}
- io_ctl->pages[i] = page;
- if (uptodate && !PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ io_ctl->pages[i] = &folio->page;
+ if (uptodate && !folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"free space cache page truncated");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
io_ctl_drop_pages(io_ctl);
@@ -755,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -971,8 +968,8 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
path = btrfs_alloc_path();
if (!path)
return 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
/*
* We must pass a path with search_commit_root set to btrfs_iget in
@@ -1083,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) {
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
}
if (!node && cluster) {
@@ -1158,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans,
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1175,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC,
- NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1189,7 +1185,6 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -1223,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries(
start = block_group->start;
while (start < block_group->start + block_group->length) {
- if (!find_first_extent_bit(unpin, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY, NULL))
+ if (!btrfs_find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
return 0;
/* This pinned extent is out of our range */
@@ -1271,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1292,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1418,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1479,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2287,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
* to reserve them to larger extents, however if we have plenty
- * of cache left then go ahead an dadd them, no sense in adding
+ * of cache left then go ahead and add them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
if (info->bytes <= fs_info->sectorsize * 8) {
@@ -2346,9 +2341,8 @@ again:
struct rb_node *node;
struct btrfs_free_space *entry;
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
spin_lock(&cluster->lock);
node = rb_first(&cluster->root);
if (!node) {
@@ -3198,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- int err;
+ int ret2;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
@@ -3206,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
- if (err) {
+ ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
+ if (ret2) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
return 0;
@@ -3662,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
- int update = 0;
+ bool bg_ro;
const u64 end = start + bytes;
const u64 reserved_end = reserved_start + reserved_bytes;
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
@@ -3670,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group,
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (!block_group->ro) {
+ bg_ro = block_group->ro;
+ if (!bg_ro) {
block_group->reserved += reserved_bytes;
+ spin_unlock(&block_group->lock);
space_info->bytes_reserved += reserved_bytes;
- update = 1;
+ } else {
+ spin_unlock(&block_group->lock);
}
- spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
@@ -3696,14 +3692,16 @@ static int do_trimming(struct btrfs_block_group *block_group,
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
- if (update) {
+ if (!bg_ro) {
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->ro)
- space_info->bytes_readonly += reserved_bytes;
+ bg_ro = block_group->ro;
block_group->reserved -= reserved_bytes;
- space_info->bytes_reserved -= reserved_bytes;
spin_unlock(&block_group->lock);
+
+ space_info->bytes_reserved -= reserved_bytes;
+ if (bg_ro)
+ space_info->bytes_readonly += reserved_bytes;
spin_unlock(&space_info->lock);
}
@@ -3835,7 +3833,7 @@ out_unlock:
/*
* If we break out of trimming a bitmap prematurely, we should reset the
- * trimming bit. In a rather contrieved case, it's possible to race here so
+ * trimming bit. In a rather contrived case, it's possible to race here so
* reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
*
* start = start of bitmap
@@ -4148,7 +4146,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act
if (!active) {
set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
ret = cleanup_free_space_cache_v1(fs_info, trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
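Several hunks in free-space-cache.c replace the manual btrfs_free_path()/goto-out pattern with BTRFS_PATH_AUTO_FREE, which is presumably built on the kernel's scope-based cleanup helpers. A minimal userspace analogue of that idiom, with stand-in types rather than the real btrfs definitions, looks like this:

	/*
	 * Scope-based cleanup sketch: the pointer is freed automatically on
	 * every return path, so error handling needs no "goto out" labels.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	struct path {
		int slots[8];
	};

	static void free_path(struct path **p)
	{
		free(*p);	/* free(NULL) is a no-op, like btrfs_free_path() */
	}

	/* Declare a pointer that is released when it goes out of scope. */
	#define PATH_AUTO_FREE(name) \
		struct path *name __attribute__((cleanup(free_path))) = NULL

	static int do_work(int fail_early)
	{
		PATH_AUTO_FREE(path);

		path = calloc(1, sizeof(*path));
		if (!path)
			return -1;

		if (fail_early)
			return -1;	/* no explicit free needed on any return */

		path->slots[0] = 42;
		return 0;
	}

	int main(void)
	{
		printf("ok path: %d\n", do_work(0));
		printf("error path: %d\n", do_work(1));
		return 0;
	}

The payoff is visible in btrfs_remove_free_space_inode() above: once the path is auto-freed, the early returns replace the out label and the function body shrinks without changing behavior.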
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 7ba50e133921..1ad2ad384b9e 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root(
return btrfs_global_root(block_group->fs_info, &key);
}
-void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -82,23 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
- ret = 0;
-out:
btrfs_release_path(path);
- return ret;
+ return 0;
}
EXPORT_FOR_TESTS
-struct btrfs_free_space_info *search_free_space_info(
+struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
@@ -118,7 +114,7 @@ struct btrfs_free_space_info *search_free_space_info(
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
block_group->start);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-ENOENT);
}
@@ -141,13 +137,13 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- if (ret == 0) {
- ASSERT(0);
+ if (unlikely(ret == 0)) {
+ DEBUG_WARN();
return -EIO;
}
- if (p->slots[0] == 0) {
- ASSERT(0);
+ if (unlikely(p->slots[0] == 0)) {
+ DEBUG_WARN("no previous slot found");
return -EIO;
}
p->slots[0]--;
@@ -169,11 +165,9 @@ static unsigned long *alloc_bitmap(u32 bitmap_size)
/*
* GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
- * into the filesystem as the free space bitmap can be modified in the
- * critical section of a transaction commit.
- *
- * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
- * know that recursion is unsafe.
+ * into the filesystem here. All callers hold a transaction handle
+ * open, so if a GFP_KERNEL allocation recurses into the filesystem
+ * and triggers a transaction commit, we would deadlock.
*/
nofs_flag = memalloc_nofs_save();
ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
@@ -202,9 +196,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
EXPORT_FOR_TESTS
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -222,10 +216,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (unlikely(!bitmap))
+ return 0;
start = block_group->start;
end = block_group->start + block_group->length;
@@ -236,8 +228,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -272,31 +266,35 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = true;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -317,14 +315,15 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
i += extent_size;
@@ -334,15 +333,13 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
EXPORT_FOR_TESTS
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -359,10 +356,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (unlikely(!bitmap))
+ return 0;
start = block_group->start;
end = block_group->start + block_group->length;
@@ -373,8 +368,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -403,50 +400,56 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
data_size = free_space_bitmap_size(fs_info,
found_key.offset);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ path->slots[0]--;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
read_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
nr++;
- path->slots[0]--;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = false;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
- nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
+ nrbits = block_group->length >> fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
ASSERT(start_bit < end_bit);
- key.objectid = start + start_bit * block_group->fs_info->sectorsize;
+ key.objectid = start + start_bit * fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
+ key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
extent_count++;
@@ -454,21 +457,19 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -485,34 +486,31 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, block_group, path, 1);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
flags = btrfs_free_space_flags(path->nodes[0], info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_extents(trans, block_group, path);
}
-out:
return ret;
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset)
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -530,13 +528,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
i = div_u64(offset - found_start,
block_group->fs_info->sectorsize);
- return !!extent_buffer_test_bit(leaf, ptr, i);
+ return extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 *start, u64 *size,
- int bit)
+static void free_space_modify_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ bool set_bits)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
@@ -560,7 +558,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
first = (*start - found_start) >> fs_info->sectorsize_bits;
last = (end - found_start) >> fs_info->sectorsize_bits;
- if (bit)
+ if (set_bits)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
@@ -604,13 +602,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
- u64 start, u64 size, int remove)
+ u64 start, u64 size, bool remove)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
- int prev_bit, next_bit;
+ bool prev_bit_set = false;
+ bool next_bit_set = false;
int new_extents;
int ret;
@@ -627,16 +626,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
+ return ret;
- prev_bit = free_space_test_bit(block_group, path, prev_block);
+ prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block);
/* The previous block may have been in the previous bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (start >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
} else {
key.objectid = start;
@@ -645,9 +644,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
-
- prev_bit = -1;
+ return ret;
}
/*
@@ -657,13 +654,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
- !remove);
+ free_space_modify_bits(trans, block_group, path, &cur_start,
+ &cur_size, !remove);
if (cur_size == 0)
break;
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
/*
@@ -676,42 +673,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
if (end >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
- next_bit = free_space_test_bit(block_group, path, end);
- } else {
- next_bit = -1;
+ next_bit_set = btrfs_free_space_test_bit(block_group, path, end);
}
if (remove) {
new_extents = -1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Leftover on the left. */
new_extents++;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Leftover on the right. */
new_extents++;
}
} else {
new_extents = 1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Merging with neighbor on the left. */
new_extents--;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Merging with neighbor on the right. */
new_extents--;
}
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
@@ -732,7 +723,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -764,7 +755,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
/* Delete the existing key (cases 1-4). */
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
/* Add a key for leftovers at the beginning (cases 3 and 4). */
if (start > found_start) {
@@ -775,7 +766,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
@@ -788,81 +779,89 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
-EXPORT_FOR_TESTS
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path)
{
struct btrfs_free_space_info *info;
u32 flags;
- int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ if (bg->using_free_space_bitmaps_cached)
+ return bg->using_free_space_bitmaps;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, bg, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS);
+ bg->using_free_space_bitmaps_cached = true;
+
+ return bg->using_free_space_bitmaps;
+}
+
+EXPORT_FOR_TESTS
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ int ret;
+
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
+
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 1);
- } else {
- return remove_free_space_extent(trans, block_group, path,
- start, size);
- }
+ start, size, true);
+
+ return remove_free_space_extent(trans, block_group, path, start, size);
}
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
- ASSERT(0);
+ if (unlikely(!block_group)) {
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, block_group, path, start,
- size);
+ ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
-
- btrfs_put_block_group(block_group);
-out:
- btrfs_free_path(path);
if (ret)
btrfs_abort_transaction(trans, ret);
+
+ btrfs_put_block_group(block_group);
+
return ret;
}
@@ -909,7 +908,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -932,7 +931,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
if (found_end == start) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.objectid = found_start;
new_key.offset += key.offset;
new_extents--;
@@ -949,7 +948,7 @@ right:
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -973,7 +972,7 @@ right:
if (found_start == end) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.offset += key.offset;
new_extents--;
}
@@ -983,78 +982,67 @@ insert:
/* Insert the new key (cases 1-4). */
ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
EXPORT_FOR_TESTS
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
{
- struct btrfs_free_space_info *info;
- u32 flags;
int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info))
- return PTR_ERR(info);
- flags = btrfs_free_space_flags(path->nodes[0], info);
- btrfs_release_path(path);
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 0);
- } else {
- return add_free_space_extent(trans, block_group, path, start,
- size);
- }
+ start, size, false);
+
+ return add_free_space_extent(trans, block_group, path, start, size);
}
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
- ASSERT(0);
+ if (unlikely(!block_group)) {
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, block_group, path, start, size);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
-
- btrfs_put_block_group(block_group);
-out:
- btrfs_free_path(path);
if (ret)
btrfs_abort_transaction(trans, ret);
+
+ btrfs_put_block_group(block_group);
+
return ret;
}
@@ -1067,7 +1055,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path, *path2;
+ BTRFS_PATH_AUTO_FREE(path);
+ BTRFS_PATH_AUTO_FREE(path2);
struct btrfs_key key;
u64 start, end;
int ret;
@@ -1075,17 +1064,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
- if (!path2) {
- btrfs_free_path(path);
+ if (!path2)
return -ENOMEM;
- }
+
+ path->reada = READA_FORWARD;
ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
- goto out;
+ return ret;
mutex_lock(&block_group->free_space_lock);
@@ -1104,11 +1092,22 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
-
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature (so block group
+ * items are stored in the block group tree) or this is a new block
+ * group created in the current transaction and its block group item
+ * was not yet inserted in the extent tree (that happens in
+ * btrfs_create_pending_block_groups() -> insert_block_group_item()).
+ * It also means there are no extents allocated for block groups with a
+ * start offset beyond this block group's end offset (this is the last,
+ * highest, block group).
+ */
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1117,11 +1116,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans,
- block_group,
- path2, start,
- key.objectid -
- start);
+ ret = __btrfs_add_to_free_space_tree(trans,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
if (ret)
goto out_locked;
}
@@ -1138,12 +1137,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, block_group, path2,
- start, end - start);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
@@ -1151,9 +1148,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = 0;
out_locked:
mutex_unlock(&block_group->free_space_lock);
-out:
- btrfs_free_path(path2);
- btrfs_free_path(path);
+
return ret;
}
@@ -1181,7 +1176,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_put_root(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -1193,7 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
@@ -1222,8 +1217,9 @@ out_clear:
static int clear_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
+ struct rb_node *node;
int nr;
int ret;
@@ -1238,7 +1234,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
nr = btrfs_header_nritems(path->nodes[0]);
if (!nr)
@@ -1247,15 +1243,22 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(node, struct btrfs_block_group, cache_node);
+ clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
+ node = rb_next(node);
+ cond_resched();
+ }
+
+ return 0;
}
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1278,14 +1281,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1303,7 +1306,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1332,7 +1335,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1344,12 +1347,24 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
+
+ if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
+ &block_group->runtime_flags))
+ goto next;
+
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
+next:
+ if (btrfs_should_end_transaction(trans)) {
+ btrfs_end_transaction(trans);
+ trans = btrfs_start_transaction(free_space_root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ }
node = rb_next(node);
}
@@ -1366,54 +1381,82 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
+ bool own_path = false;
int ret;
- clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+ if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &block_group->runtime_flags))
+ return 0;
+
+ /*
+ * While rebuilding the free space tree we may allocate new metadata
+ * block groups while modifying the free space tree.
+ *
+ * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
+ * can use multiple transactions, every time btrfs_end_transaction() is
+ * called at btrfs_rebuild_free_space_tree() we finish the creation of
+ * new block groups by calling btrfs_create_pending_block_groups(), and
+ * that in turn calls us, through add_block_group_free_space(), to add
+ * a free space info item and a free space extent item for the block
+ * group.
+ *
+ * Then later btrfs_rebuild_free_space_tree() may find such new block
+ * groups and process them with populate_free_space_tree(), which can
+ * fail with EEXIST since there are already items for the block group in
+ * the free space tree. Notice that we say "may find" because a new
+ * block group may be added to the block groups rbtree in a node before
+ * or after the block group currently being processed by the rebuild
+ * process. So signal the rebuild process to skip such new block groups
+ * if it finds them.
+ */
+ set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (unlikely(!path)) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ return -ENOMEM;
+ }
+ own_path = true;
+ }
ret = add_new_free_space_info(trans, block_group, path);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path,
+ block_group->start, block_group->length);
if (ret)
- return ret;
+ btrfs_abort_transaction(trans, ret);
+
+out:
+ if (own_path)
+ btrfs_free_path(path);
- return __add_to_free_space_tree(trans, block_group, path,
- block_group->start,
- block_group->length);
+ return ret;
}
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path = NULL;
- int ret = 0;
+ int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = __add_block_group_free_space(trans, block_group, path);
-
-out:
- btrfs_free_path(path);
+ ret = __add_block_group_free_space(trans, block_group, NULL);
mutex_unlock(&block_group->free_space_lock);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
u64 start, end;
@@ -1429,9 +1472,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
start = block_group->start;
@@ -1443,8 +1487,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
- goto out;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -1472,16 +1518,15 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
- goto out;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
btrfs_release_path(path);
}
ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+
return ret;
}
@@ -1493,7 +1538,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- int prev_bit = 0, bit;
+ bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
u64 end, offset;
@@ -1510,7 +1555,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1524,10 +1569,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(block_group, path, offset);
- if (prev_bit == 0 && bit == 1) {
+ bool bit_set;
+
+ bit_set = btrfs_free_space_test_bit(block_group, path, offset);
+ if (!prev_bit_set && bit_set) {
extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
+ } else if (prev_bit_set && !bit_set) {
u64 space_added;
ret = btrfs_add_new_free_space(block_group,
@@ -1535,7 +1582,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1543,30 +1590,27 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
extent_count++;
}
- prev_bit = bit;
+ prev_bit_set = bit_set;
offset += fs_info->sectorsize;
}
}
- if (prev_bit == 1) {
+ if (prev_bit_set) {
ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
- goto out;
+ return ret;
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
@@ -1593,7 +1637,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1609,7 +1653,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
key.objectid + key.offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1618,28 +1662,24 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u32 extent_count, flags;
- int ret;
block_group = caching_ctl->block_group;
@@ -1651,15 +1691,14 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* Just like caching_thread() doesn't want to deadlock on the extent
* tree, we don't want to deadlock on the free space tree.
*/
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(NULL, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1669,11 +1708,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* there.
*/
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
- ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ return load_free_space_bitmaps(caching_ctl, path, extent_count);
else
- ret = load_free_space_extents(caching_ctl, path, extent_count);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return load_free_space_extents(caching_ctl, path, extent_count);
}
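
For readers following the bitmap hunk above: a minimal sketch of the bit-run walk that load_free_space_bitmaps() performs, where runs of set bits become free-space extents and the 0->1 / 1->0 transitions delimit each run. Only the prev_bit_set/bit_set transition pattern comes from the diff; every other name below is hypothetical.

static u64 sum_free_space(const bool *bits, size_t nbits, u64 start, u64 sectorsize)
{
	bool prev_bit_set = false;
	u64 extent_start = 0;
	u64 offset = start;
	u64 total_free = 0;

	for (size_t i = 0; i < nbits; i++) {
		bool bit_set = bits[i];

		if (!prev_bit_set && bit_set) {
			/* 0 -> 1 transition: a free extent starts here. */
			extent_start = offset;
		} else if (prev_bit_set && !bit_set) {
			/* 1 -> 0 transition: [extent_start, offset) was free. */
			total_free += offset - extent_start;
		}
		prev_bit_set = bit_set;
		offset += sectorsize;
	}
	/* A trailing run of set bits reaches the end of the range. */
	if (prev_bit_set)
		total_free += offset - extent_start;
	return total_free;
}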
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index e6c6d6f4f221..3d9a5d4477fc 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -22,39 +22,39 @@ struct btrfs_trans_handle;
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info);
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, int cow);
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset);
+ struct btrfs_path *path, int cow);
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset);
#endif
#endif
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 31c1648bc0b4..feb0a2faa837 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -1,9 +1,186 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
-#include "ctree.h"
#include "fs.h"
#include "accessors.h"
+#include "volumes.h"
+
+static const struct btrfs_csums {
+ u16 size;
+ const char name[10];
+ const char driver[12];
+} btrfs_csums[] = {
+ [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
+};
+
+/* This exists for btrfs-progs usage. */
+u16 btrfs_csum_type_size(u16 type)
+{
+ return btrfs_csums[type].size;
+}
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
+{
+ u16 t = btrfs_super_csum_type(s);
+
+ /* csum type is validated at mount time. */
+ return btrfs_csum_type_size(t);
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+ /* csum type is validated at mount time. */
+ return btrfs_csums[csum_type].name;
+}
+
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name.
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
+ btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
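
As an illustrative (non-patch) use of the descriptor table above, a checksum type can be resolved by name through the exported helpers; the function name below is invented for the sketch, only btrfs_get_num_csums() and btrfs_super_csum_name() come from this patch.

static int csum_type_by_name(const char *name)
{
	/* Walk the btrfs_csums[] descriptors via the public accessors. */
	for (size_t i = 0; i < btrfs_get_num_csums(); i++) {
		if (strcmp(btrfs_super_csum_name(i), name) == 0)
			return i;
	}
	return -EINVAL;
}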
+
+/*
+ * We support the following block sizes for all systems:
+ *
+ * - 4K
+ *   This is the most common block size. For PAGE_SIZE > 4K cases, the subpage
+ * mode is used.
+ *
+ * - PAGE_SIZE
+ * The straightforward block size to support.
+ *
+ * Extra block sizes are also supported depending on the kernel config:
+ *
+ * - MIN_BLOCKSIZE
+ * This is either 4K (regular builds) or 2K (debug builds)
+ * This allows testing subpage routines on x86_64.
+ */
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
+{
+ /* @blocksize should be validated first. */
+ ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE &&
+ blocksize <= BTRFS_MAX_BLOCKSIZE);
+
+ if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE)
+ return true;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * For bs > ps support it's done by specifying a minimal folio order
+ * for filemap, thus implying large data folios.
+	 * For HIGHMEM systems, we cannot always access the content of a (large)
+	 * folio in one go, and instead have to go through it page by page.
+	 *
+	 * A lot of features don't implement a proper PAGE-sized loop for large
+	 * folios; this includes:
+ *
+ * - compression
+ * - verity
+ * - encoded write
+ *
+ * Considering HIGHMEM is such a pain to deal with and it's going
+ * to be deprecated eventually, just reject HIGHMEM && bs > ps cases.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
+ return false;
+ return true;
+#endif
+ return false;
+}
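
A hypothetical mount-time consumer of the helper above could look like the sketch below; the function name and message are made up, and only btrfs_supported_blocksize() plus the BTRFS_MIN_BLOCKSIZE/BTRFS_MAX_BLOCKSIZE limits come from this patch. Note the range/power-of-two check happens first, as the ASSERT in the helper expects.

static int check_sectorsize(struct btrfs_fs_info *fs_info, u32 sectorsize)
{
	/* Must be validated before calling btrfs_supported_blocksize(). */
	if (!is_power_of_2(sectorsize) ||
	    sectorsize < BTRFS_MIN_BLOCKSIZE ||
	    sectorsize > BTRFS_MAX_BLOCKSIZE)
		return -EINVAL;

	if (!btrfs_supported_blocksize(sectorsize)) {
		btrfs_err(fs_info, "unsupported sectorsize %u", sectorsize);
		return -EINVAL;
	}
	return 0;
}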
+
+/*
+ * Start exclusive operation @type, return true on success.
+ */
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ bool ret = false;
+
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+/*
+ * Conditionally allow entering the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock()
+ * and btrfs_exclop_finish().
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - when trying to add a device and balance has been paused
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
+ */
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
+ return true;
+
+ spin_unlock(&fs_info->super_lock);
+ return false;
+}
+
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
+{
+ spin_unlock(&fs_info->super_lock);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->super_lock);
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
+{
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
+ }
+}
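
The pairing described in the comment on btrfs_exclop_start_try_lock() can be illustrated with a hypothetical caller (names invented; only the exclop helpers themselves are part of the patch):

static bool start_dev_add_exclop(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
		return false;	/* An incompatible exclusive operation is running. */

	/*
	 * super_lock is still held here; inspect any state that must not
	 * change (e.g. whether a paused balance allows the device add),
	 * then drop the lock.
	 */
	btrfs_exclop_start_unlock(fs_info);

	/* ... do the work, then release the exclusive operation. */
	btrfs_exclop_finish(fs_info);
	return true;
}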
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79a1a3d6f04d..0f7e1ef27891 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -14,10 +14,10 @@
#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <linux/rwlock_types.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/list.h>
+#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
@@ -29,6 +29,7 @@
#include "extent-io-tree.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "messages.h"
struct inode;
struct super_block;
@@ -47,6 +48,20 @@ struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;
+/*
+ * Minimum data and metadata block size.
+ *
+ * Normally it's 4K, but for testing subpage block size on 4K page systems, we
+ * allow DEBUG builds to accept a 2K block size.
+ */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_MIN_BLOCKSIZE (SZ_2K)
+#else
+#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#endif
+
+#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -59,6 +74,13 @@ struct btrfs_space_info;
#define BTRFS_SUPER_INFO_SIZE 4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
+/* Array of bytes with variable length, hexadecimal format 0x1234 */
+#define BTRFS_CSUM_FMT "0x%*phN"
+#define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes
+
+#define BTRFS_KEY_FMT "(%llu %u %llu)"
+#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset
+
/*
* Number of metadata items necessary for an unlink operation:
*
@@ -90,6 +112,8 @@ enum {
BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
+ /* Track if log replay has failed. */
+ BTRFS_FS_STATE_LOG_REPLAY_ABORTED,
/*
* Bio operations should be blocked on this filesystem because a source
* or target device is being destroyed as part of a device replace
@@ -105,6 +129,15 @@ enum {
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ /* No more delayed iput can be queued. */
+ BTRFS_FS_STATE_NO_DELAYED_IPUT,
+
+ /*
+	 * Emergency shutdown: a step further than a transaction abort,
+	 * rejecting all operations.
+ */
+ BTRFS_FS_STATE_EMERGENCY_SHUTDOWN,
+
BTRFS_FS_STATE_COUNT
};
@@ -228,6 +261,7 @@ enum {
BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
+ BTRFS_MOUNT_REF_TRACKER = (1ULL << 33),
};
/*
@@ -265,7 +299,7 @@ enum {
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
- * Features under developmen like Extent tree v2 support is enabled
+ * Features under development, like Extent tree v2 support, are enabled
* only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -285,8 +319,19 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_ZSTD = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
+};
+
struct btrfs_dev_replace {
/* See #define above */
u64 replace_state;
@@ -404,6 +449,8 @@ struct btrfs_commit_stats {
u64 last_commit_dur;
/* The total commit duration in ns */
u64 total_commit_dur;
+ /* Start of the last critical section in ns. */
+ u64 critical_section_start_time;
};
struct btrfs_fs_info {
@@ -456,6 +503,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
struct btrfs_block_rsv delayed_refs_rsv;
+ /* Block reservation for treelog tree */
+ struct btrfs_block_rsv treelog_rsv;
struct btrfs_block_rsv empty_block_rsv;
@@ -485,8 +534,11 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
- unsigned long compress_type:4;
- unsigned int compress_level;
+ /* Compress related structures. */
+ void *compr_wsm[BTRFS_NR_COMPRESS_TYPES];
+
+ int compress_type;
+ int compress_level;
u32 commit_interval;
/*
	 * It is a suggestive number, the read side is safe even if it gets a
@@ -606,7 +658,6 @@ struct btrfs_fs_info {
struct workqueue_struct *endio_workers;
struct workqueue_struct *endio_meta_workers;
struct workqueue_struct *rmw_workers;
- struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -627,6 +678,9 @@ struct btrfs_fs_info {
struct kobject *qgroups_kobj;
struct kobject *discard_kobj;
+ /* Track the number of blocks (sectors) read by the filesystem. */
+ struct percpu_counter stats_read_blocks;
+
/* Used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
@@ -692,8 +746,6 @@ struct btrfs_fs_info {
u32 data_chunk_allocations;
u32 metadata_ratio;
- void *bdev_holder;
-
/* Private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
@@ -706,7 +758,6 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
struct btrfs_discard_ctl discard_ctl;
@@ -719,12 +770,6 @@ struct btrfs_fs_info {
spinlock_t qgroup_lock;
/*
- * Used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
* Protect user change for quota operations. If a transaction is needed,
* it must be started before locking this lock.
*/
@@ -759,10 +804,8 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer radix tree */
- spinlock_t buffer_lock;
- /* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ /* Entries are eb->start >> nodesize_bits */
+ struct xarray buffer_tree;
/* Next backup root to be overwritten */
int backup_root_index;
@@ -793,9 +836,12 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
+ u32 nodesize_bits;
u32 sectorsize;
	/* ilog2 of sectorsize, used to avoid 64bit division */
u32 sectorsize_bits;
+ u32 block_min_order;
+ u32 block_max_order;
u32 csum_size;
u32 csums_per_leaf;
u32 stripesize;
@@ -865,12 +911,10 @@ struct btrfs_fs_info {
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
spinlock_t ref_verify_lock;
struct rb_root block_tree;
-#endif
-#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct list_head allocated_roots;
@@ -887,6 +931,17 @@ struct btrfs_fs_info {
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
struct inode *: (_inode)))->root->fs_info)
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
+}
+
+/* Return the minimal folio size of the fs. */
+static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info)
+{
+ return 1U << (PAGE_SHIFT + fs_info->block_min_order);
+}
+
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@@ -953,6 +1008,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
+#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits)
+
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
@@ -971,6 +1028,13 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
+static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
+ const struct folio *folio)
+{
+ return folio_size(folio) >> fs_info->sectorsize_bits;
+}
+
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
@@ -982,6 +1046,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
+u16 btrfs_csum_type_size(u16 type);
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
+static inline bool btrfs_is_empty_uuid(const u8 *uuid)
+{
+ return uuid_is_null((const uuid_t *)uuid);
+}
+
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
@@ -1058,13 +1133,42 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
+static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state);
+}
+
+static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)
+{
+ /*
+	 * Here we do not want to use handle_fs_error(), which would mark the
+	 * fs read-only.
+	 * Some call sites, like the shutdown ioctl, mark the fs shut down
+	 * while it is frozen, and the thaw path handles RO and RW filesystems
+	 * differently.
+ *
+ * So here we only mark the fs error without flipping it RO.
+ */
+ WRITE_ONCE(fs_info->fs_error, -EIO);
+ if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state))
+ btrfs_crit(fs_info, "emergency shutdown");
+}
+
+/*
+ * We use folio flag owner_2 to indicate there is an ordered extent with
+ * unfinished IO.
+ */
+#define folio_test_ordered(folio) folio_test_owner_2(folio)
+#define folio_set_ordered(folio) folio_set_owner_2(folio)
+#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
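
A small illustrative (non-patch) helper showing the owner_2 aliases above being used like any other folio flag; the function name is invented for the sketch.

static void mark_folio_has_ordered(struct folio *folio, bool has_ordered)
{
	if (has_ordered)
		folio_set_ordered(folio);
	else
		folio_clear_ordered(folio);
	/* The flag now mirrors whether an ordered extent has unfinished IO. */
	WARN_ON_ONCE(folio_test_ordered(folio) != has_ordered);
}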
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state));
}
void btrfs_test_destroy_inode(struct inode *inode);
@@ -1073,9 +1177,9 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return 0;
+ return false;
}
#endif
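
As a worked example of the shift-based conversions introduced in fs.h above (illustrative only; assumes a 4K sectorsize, i.e. sectorsize_bits == 12, and a 16K folio; the function name is made up):

static void blocksize_math_example(struct btrfs_fs_info *fs_info,
				   const struct folio *folio)
{
	/* 1 MiB of data spans SZ_1M >> 12 == 256 blocks, no division needed. */
	u64 blocks = BTRFS_BYTES_TO_BLKS(fs_info, SZ_1M);

	/* A 16K folio holds 16K >> 12 == 4 blocks. */
	unsigned int per_folio = btrfs_blocks_per_folio(fs_info, folio);

	btrfs_info(fs_info, "blocks=%llu per_folio=%u min_folio=%u",
		   blocks, per_folio, btrfs_min_folio_size(fs_info));
}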
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 29572dfaf878..b73e1dd97208 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
}
/* Returns NULL if no extref found */
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow)
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid)
{
int ret;
struct btrfs_key key;
@@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
@@ -109,7 +106,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
@@ -129,9 +126,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
- ret = -ENOENT;
+ return -ENOENT;
if (ret < 0)
- goto out;
+ return ret;
/*
* Sanity check - did we find the right item for this name?
@@ -140,10 +137,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
- if (!extref) {
+ if (unlikely(!extref)) {
btrfs_abort_transaction(trans, -ENOENT);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
leaf = path->nodes[0];
@@ -152,12 +148,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*index = btrfs_inode_extref_index(leaf, extref);
if (del_len == item_size) {
- /*
- * Common case only one ref in the item, remove the
- * whole item.
- */
- ret = btrfs_del_item(trans, root, path);
- goto out;
+ /* Common case only one ref in the item, remove the whole item. */
+ return btrfs_del_item(trans, root, path);
}
ptr = (unsigned long)extref;
@@ -168,9 +160,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
btrfs_truncate_item(trans, path, item_size - del_len, 1);
-out:
- btrfs_free_path(path);
-
return ret;
}
@@ -191,8 +180,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -260,7 +249,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
int ret;
int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -279,13 +268,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
path->slots[0],
ref_objectid,
name))
- goto out;
+ return ret;
btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
@@ -298,11 +287,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
@@ -319,14 +305,14 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_release_on_error = 1;
+ path->skip_release_on_error = true;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -363,8 +349,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-
out:
btrfs_free_path(path);
@@ -460,7 +444,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_truncate_control *control)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
@@ -497,8 +481,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
key.objectid = control->ino;
- key.offset = (u64)-1;
key.type = (u8)-1;
+ key.offset = (u64)-1;
search_again:
/*
@@ -590,7 +574,6 @@ search_again:
num_dec = (orig_num_bytes - extent_num_bytes);
if (extent_start != 0)
control->sub_bytes += num_dec;
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -644,7 +627,7 @@ delete:
if (control->clear_extent_range) {
ret = btrfs_inode_clear_file_extent_range(control->inode,
clear_start, clear_len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -683,7 +666,7 @@ delete:
btrfs_init_data_ref(&ref, control->ino, extent_offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -701,7 +684,7 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -734,13 +717,12 @@ delete:
}
out:
if (ret >= 0 && pending_del_nr) {
- int err;
+ int ret2;
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
+ ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
+ if (unlikely(ret2)) {
+ btrfs_abort_transaction(trans, ret2);
+ ret = ret2;
}
}
@@ -748,6 +730,5 @@ out:
if (!ret && control->last_size > new_size)
control->last_size = new_size;
- btrfs_free_path(path);
return ret;
}
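
The conversions in this file follow the BTRFS_PATH_AUTO_FREE() pattern visible in the hunks: the path is released automatically when it goes out of scope, so the goto out / btrfs_free_path() pairs can be dropped. A minimal sketch, assuming the usual scope-based cleanup semantics behind the macro (the lookup function itself is hypothetical):

static int lookup_something(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);	/* Freed automatically on every return. */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;	/* No "goto out" and no explicit btrfs_free_path(). */
	if (ret > 0)
		return -ENOENT;
	return 0;
}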
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index c11b97fdccc4..6d9f5ad20646 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
-struct btrfs_inode_extref *btrfs_lookup_inode_extref(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid);
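
A hypothetical caller of the slimmed-down prototype above: with the trans/ins_len/cow parameters gone the lookup is read-only, so a caller only needs the root, a path, the name and the two object ids (the helper name below is invented for the sketch):

static bool extref_exists(struct btrfs_root *root, const struct fscrypt_str *name,
			  u64 ino, u64 parent_ino)
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_inode_extref *extref;

	path = btrfs_alloc_path();
	if (!path)
		return false;

	extref = btrfs_lookup_inode_extref(root, path, name, ino, parent_ino);
	return !IS_ERR_OR_NULL(extref);	/* NULL means no extref was found. */
}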
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03fe0de2cd0d..c4bee47829ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9,6 +9,7 @@
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
@@ -71,6 +72,10 @@
#include "backref.h"
#include "raid-stripe-tree.h"
#include "fiemap.h"
+#include "delayed-inode.h"
+
+#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
+#define COW_FILE_RANGE_NO_INLINE (1UL << 1)
struct btrfs_iget_args {
u64 ino;
@@ -127,7 +132,7 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
struct btrfs_fs_info *fs_info = warn->fs_info;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
unsigned int nofs_flag;
@@ -174,8 +179,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return ret;
}
ret = paths_from_inode(inum, ipath);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_put_root(local_root);
goto err;
+ }
/*
* We deliberately ignore the bit ipath might have been too small to
@@ -190,7 +197,6 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
}
btrfs_put_root(local_root);
- free_ipath(ipath);
return 0;
err:
@@ -198,7 +204,6 @@ err:
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
warn->logical, warn->mirror_num, root, inum, offset, ret);
- free_ipath(ipath);
return ret;
}
@@ -230,21 +235,21 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
if (logical == U64_MAX) {
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
-"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
return;
}
logical += file_off;
btrfs_warn_rl(fs_info,
-"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
@@ -308,26 +313,26 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
-"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(root), btrfs_ino(inode),
logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
} else {
btrfs_warn_rl(root->fs_info,
-"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(root), btrfs_ino(inode),
logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
}
}
@@ -367,7 +372,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * Unock inode i_rwsem.
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -393,39 +398,20 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
- struct folio *locked_folio,
u64 offset, u64 bytes)
{
- unsigned long index = offset >> PAGE_SHIFT;
- unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = 0, page_end = 0;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
struct folio *folio;
- if (locked_folio) {
- page_start = folio_pos(locked_folio);
- page_end = page_start + folio_size(locked_folio) - 1;
- }
-
while (index <= end_index) {
- /*
- * For locked page, we will call btrfs_mark_ordered_io_finished
- * through btrfs_mark_ordered_io_finished() on it
- * in run_delalloc_range() for the error handling, which will
- * clear page Ordered and run the ordered extent accounting.
- *
- * Here we can't just clear the Ordered bit, or
- * btrfs_mark_ordered_io_finished() would skip the accounting
- * for the page range, and the ordered extent will never finish.
- */
- if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
+ if (IS_ERR(folio)) {
index++;
continue;
}
- folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- index++;
- if (IS_ERR(folio))
- continue;
+ index = folio_next_index(folio);
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -436,23 +422,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
folio_put(folio);
}
- if (locked_folio) {
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_start + folio_size(locked_folio))
- return;
- /*
- * In case this page belongs to the delalloc range being
- * instantiated then skip it, since the first page of a range is
- * going to be properly cleaned up by the caller of
- * run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - folio_pos(locked_folio) -
- folio_size(locked_folio);
- offset = folio_pos(locked_folio) + folio_size(locked_folio);
- }
- }
-
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
@@ -461,18 +430,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
- int err;
+ int ret;
if (args->default_acl) {
- err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
ACL_TYPE_DEFAULT);
- if (err)
- return err;
+ if (ret)
+ return ret;
}
if (args->acl) {
- err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
- if (err)
- return err;
+ ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (ret)
+ return ret;
}
if (!args->default_acl && !args->acl)
cache_no_acl(args->inode);
@@ -527,8 +496,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t datasize;
key.objectid = btrfs_ino(inode);
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -564,7 +533,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
kunmap_local(kaddr);
folio_put(folio);
}
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -605,23 +573,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (offset != 0)
return false;
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (fs_info->sectorsize != PAGE_SIZE)
- return false;
-
/* Inline extents are limited to sectorsize. */
if (size > fs_info->sectorsize)
return false;
+	/* We do not allow a non-compressed extent to be as large as the block size. */
+ if (data_len >= fs_info->sectorsize)
+ return false;
+
/* We cannot exceed the maximum inline data size. */
if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return false;
@@ -634,6 +593,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (size < i_size_read(&inode->vfs_inode))
return false;
+ /* Encrypted file cannot be inlined. */
+ if (IS_ENCRYPTED(&inode->vfs_inode))
+ return false;
+
return true;
}
@@ -677,7 +640,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
drop_args.replace_extent = true;
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -685,7 +648,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -695,7 +658,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -711,7 +674,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -734,12 +697,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
return 1;
- lock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
- unlock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
return ret;
}
@@ -825,33 +788,19 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!btrfs_inode_can_compress(inode)) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(inode));
+ DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
return 0;
}
- /*
- * Only enable sector perfect compression for experimental builds.
- *
- * This is a big feature change for subpage cases, and can hit
- * different corner cases, so only limit this feature for
- * experimental build for now.
- *
- * ETA for moving this out of experimental builds is 6.15.
- */
- if (fs_info->sectorsize < PAGE_SIZE &&
- !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(end + 1))
- return 0;
- }
+ /* Defrag ioctl takes precedence over mount options and properties. */
+ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+ return 0;
+ if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+ inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+ return 1;
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
- /* defrag ioctl */
- if (inode->defrag_compress)
- return 1;
/* bad compression ratios */
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
@@ -871,21 +820,20 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(inode, small_write);
}
-static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
- unsigned long end_index = end >> PAGE_SHIFT;
+ const pgoff_t end_index = end >> PAGE_SHIFT;
struct folio *folio;
int ret = 0;
- for (unsigned long index = start >> PAGE_SHIFT;
- index <= end_index; index++) {
- folio = filemap_get_folio(inode->i_mapping, index);
+ for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
end + 1 - start);
folio_put(folio);
}
@@ -912,19 +860,25 @@ static void compress_file_range(struct btrfs_work *work)
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size;
int ret = 0;
- struct folio **folios;
+ struct folio **folios = NULL;
unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned int poff;
+ unsigned int loff;
int i;
int compress_type = fs_info->compress_type;
+ int compress_level = fs_info->compress_level;
+
+ if (unlikely(btrfs_is_shutdown(fs_info)))
+ goto cleanup_and_bail_uncompressed;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -933,7 +887,7 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(inode, start, end);
/*
* All the folios should have been locked thus no failure.
@@ -959,8 +913,8 @@ static void compress_file_range(struct btrfs_work *work)
actual_end = min_t(u64, i_size, end + 1);
again:
folios = NULL;
- nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
+ nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
/*
* we don't want to send crud past the end of i_size through
@@ -1007,25 +961,27 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress)
+ if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
+ compress_level = inode->defrag_compress_level;
+ } else if (inode->prop_compress) {
compress_type = inode->prop_compress;
+ }
/* Compression level is applied here. */
- ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
- mapping, start, folios, &nr_folios, &total_in,
+ ret = btrfs_compress_folios(compress_type, compress_level,
+ inode, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
goto mark_incompressible;
/*
- * Zero the tail end of the last page, as we might be sending it down
+ * Zero the tail end of the last folio, as we might be sending it down
* to disk.
*/
- poff = offset_in_page(total_compressed);
- if (poff)
- folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
+ loff = (total_compressed & (min_folio_size - 1));
+ if (loff)
+ folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
/*
* Try to create an inline extent.
@@ -1129,19 +1085,13 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, locked_folio,
- start, end - start + 1);
- if (locked_folio) {
- const u64 page_start = folio_pos(locked_folio);
-
- folio_start_writeback(locked_folio);
- folio_end_writeback(locked_folio);
- btrfs_mark_ordered_io_finished(inode, locked_folio,
- page_start, PAGE_SIZE,
- !ret);
- mapping_set_error(locked_folio->mapping, ret);
- folio_unlock(locked_folio);
- }
+ if (locked_folio)
+ btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
+ start, async_extent->ram_size);
+ btrfs_err_rl(inode->root->fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start, async_extent->ram_size, ret);
}
}
@@ -1160,6 +1110,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1180,14 +1131,17 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, *alloc_hint, &ins, 1, 1);
+ 0, *alloc_hint, &ins, true, true);
if (ret) {
/*
* We can't reserve contiguous space for the compressed size.
@@ -1196,10 +1150,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
@@ -1214,10 +1169,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
ret = PTR_ERR(em);
goto out_free_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_COMPRESSED);
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1237,12 +1192,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, &cached,
@@ -1269,7 +1226,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 alloc_hint = 0;
read_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, start, num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
if (em) {
/*
* if block start isn't an actual block number then find the
@@ -1277,15 +1234,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* block is also bogus then just don't worry about it.
*/
if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
+ btrfs_free_extent_map(em);
+ em = btrfs_search_extent_mapping(em_tree, 0, 0);
if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- alloc_hint = extent_map_block_start(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
if (em)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
} else {
- alloc_hint = extent_map_block_start(em);
- free_extent_map(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
+ btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
@@ -1302,29 +1259,26 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_folio.
+ * When this function fails, it unlocks all folios except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_folio and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_folio is
- * the only page handled anyway).
+ * unlocks all folios including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single block, so locked_folio is
+ * the only folio handled anyway).
*
- * When this function succeed and creates a normal extent, the page locking
+ * When this function succeeds and creates a normal extent, the folio locking
* status depends on the passed in flags:
*
- * - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_folio are unlocked.
+ * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
+ * - Else all folios except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
- * while-loop, the ordered extents created in previous iterations are kept
- * intact. So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset,
- bool keep_locked, bool no_inline)
+ unsigned long flags)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1341,6 +1295,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
unsigned long page_ops;
int ret = 0;
+ if (unlikely(btrfs_is_shutdown(fs_info))) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL;
goto out_unlock;
@@ -1352,7 +1311,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (!no_inline) {
+ if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
@@ -1373,6 +1332,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
+
+ /*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
* extents follows a NOCOW path because relocation preallocates the
@@ -1394,7 +1364,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
- &ins, 1, 1);
+ &ins, true, true);
if (ret == -EAGAIN) {
/*
* btrfs_reserve_extent only returns -EAGAIN for zoned
@@ -1415,8 +1385,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1431,24 +1406,28 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
- lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
- &cached);
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
+ btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
+ &cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_REGULAR);
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1476,21 +1455,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered flag so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
- locked_folio, &cached,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
if (num_bytes < cur_alloc_size)
num_bytes = 0;
else
@@ -1507,6 +1471,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
@@ -1516,7 +1482,7 @@ out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock:
/*
* Now, we have three regions to clean up:
@@ -1527,35 +1493,30 @@ out_unlock:
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
- * for this region. They are cleaned up by
- * btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+	 * for this region, thus we need to clean up those ordered extents.
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the ordered extents cleanup.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_folio) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+	 * finish the writeback of the involved folios, which will never be submitted.
*/
- if (keep_locked && orig_start < start) {
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+ btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_folio, NULL, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
- /*
- * At this point we're unlocked, we want to make sure we're only
- * clearing these flags under the extent lock, so lock the rest of the
- * range and clear everything up.
- */
- lock_extent(&inode->io_tree, start, end, NULL);
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (2). If we reserved an extent for our delalloc range
@@ -1589,6 +1550,11 @@ out_unlock:
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start,
+ start, cur_alloc_size, ret);
return ret;
}
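
To make the three cleanup regions of the error path above concrete, here is a minimal standalone sketch (userspace C, not kernel code; every name is local to the example) that prints the regions the error path distinguishes, using the same arithmetic as the ordered-extent cleanup and qgroup free calls shown in the hunks above:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch of the cleanup regions in cow_file_range()'s error path,
 * assuming the layout visible in the hunks above:
 *   (1) [orig_start, start)              - ordered extents were already created
 *   (2) [start, start + cur_alloc_size)  - the allocation that failed
 *   (3) [start + cur_alloc_size, end]    - untouched, qgroup data reservation freed
 */
static void print_cleanup_regions(uint64_t orig_start, uint64_t start,
				  uint64_t cur_alloc_size, uint64_t end)
{
	if (orig_start < start)
		printf("region 1: [%llu, %llu]\n",
		       (unsigned long long)orig_start,
		       (unsigned long long)(start - 1));
	if (cur_alloc_size)
		printf("region 2: [%llu, %llu]\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + cur_alloc_size - 1));
	if (start + cur_alloc_size <= end)
		printf("region 3: [%llu, %llu], %llu bytes of qgroup data freed\n",
		       (unsigned long long)(start + cur_alloc_size),
		       (unsigned long long)end,
		       (unsigned long long)(end - start - cur_alloc_size + 1));
}

int main(void)
{
	/* e.g. a 1 MiB delalloc range where the second 128K allocation failed */
	print_cleanup_regions(0, 128 * 1024, 128 * 1024, 1024 * 1024 - 1);
	return 0;
}
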
@@ -1626,8 +1592,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
PAGE_SHIFT;
while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
+ async_extent = list_first_entry(&async_chunk->extents,
+ struct async_extent, list);
list_del(&async_extent->list);
submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
}
@@ -1741,7 +1707,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
- &done_offset, true, false);
+ &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
if (ret)
return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
@@ -1797,9 +1763,9 @@ static int fallback_to_cow(struct btrfs_inode *inode,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
- lock_extent(io_tree, start, end, &cached_state);
- count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0, NULL);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
+ count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1809,22 +1775,28 @@ static int fallback_to_cow(struct btrfs_inode *inode,
bytes = range_bytes;
spin_lock(&sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, bytes);
spin_unlock(&sinfo->lock);
if (count > 0)
- clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- NULL);
+ btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ &cached_state);
}
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
+ *
+	 * And here we do not unlock the folios after a successful run.
+	 * They will be unlocked after everything is finished, or by the error handling.
+	 *
+	 * This ensures the error handling never has to clear dirty/ordered flags on an
+	 * unlocked folio, which could race with writeback.
*/
- ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
- true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
+ COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
ASSERT(ret != 1);
return ret;
}
@@ -1837,7 +1809,6 @@ struct can_nocow_file_extent_args {
/* End file offset (inclusive) of the range we want to NOCOW. */
u64 end;
bool writeback_path;
- bool strict;
/*
* Free the path passed to can_nocow_file_extent() once it's not needed
* anymore.
@@ -1892,8 +1863,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* for its subvolume was created, then this implies the extent is shared,
* hence we must COW.
*/
- if (!args->strict &&
- btrfs_file_extent_generation(leaf, fi) <=
+ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out;
@@ -1922,9 +1892,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
*/
btrfs_release_path(path);
- ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->file_extent.offset,
- args->file_extent.disk_bytenr, args->strict, path);
+ ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1970,8 +1939,74 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ const u64 len = nocow_args->file_extent.num_bytes;
+ const u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto error;
+ }
+ btrfs_free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ ret = PTR_ERR(ordered);
+ goto error;
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+		 * extent_clear_unlock_delalloc() in the error handler from freeing
+		 * the metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ if (ret < 0)
+ goto error;
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_SET_ORDERED);
+ return ret;
+
+error:
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_err(inode->root->fs_info,
+		  "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ file_pos, len, ret);
+ return ret;
+}
+
/*
- * when nowcow writeback call back. This checks for snapshots or COW copies
+ * Called for NOCOW writeback. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
* If no cow copies or snapshots exist, we write directly to the existing
@@ -1983,13 +2018,28 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
u64 cow_start = (u64)-1;
+ /*
+	 * If not 0, cow_end is the inclusive end of the last failed fallback_to_cow()
+	 * range. Only used for error handling.
+	 *
+	 * The same goes for nocow_end, which avoids double-cleaning a range that
+	 * nocow_one_range() has already cleaned up.
+ */
+ u64 cow_end = 0;
+ u64 nocow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
u64 ino = btrfs_ino(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
+ /* The range that has ordered extent(s). */
+ u64 oe_cleanup_start;
+ u64 oe_cleanup_len = 0;
+ /* The range that is untouched. */
+ u64 untouched_start;
+ u64 untouched_len = 0;
/*
* Normally on a zoned device we're only doing COW writes, but in case
@@ -1998,6 +2048,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
+ if (unlikely(btrfs_is_shutdown(fs_info))) {
+ ret = -EIO;
+ goto error;
+ }
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -2009,15 +2063,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2072,12 +2123,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+		 * record the gap for COW and advance cur_offset to where this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2143,74 +2195,23 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1);
- cow_start = (u64)-1;
if (ret) {
+ cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
+ cow_start = (u64)-1;
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0) {
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
goto error;
}
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -2218,43 +2219,113 @@ must_cow:
cow_start = cur_offset;
if (cow_start != (u64)-1) {
- cur_offset = end;
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
- cow_start = (u64)-1;
- if (ret)
+ if (ret) {
+ cow_end = end;
goto error;
+ }
+ cow_start = (u64)-1;
}
+ /*
+	 * Everything finished without an error, we can unlock the folios now.
+	 *
+	 * No need to touch the io tree range nor set the folio ordered flag, as
+ * fallback_to_cow() and nocow_one_range() have already handled them.
+ */
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
+
btrfs_free_path(path);
return 0;
error:
- /*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to cow_start to ensure the COW region is unlocked
- * as well.
- */
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ if (cow_start == (u64)-1) {
+ /*
+ * case a)
+ * start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We finished a fallback_to_cow() or nocow_one_range() call,
+ * but failed to check the next range.
+ *
+ * or
+ * start cur_offset nocow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+		 * nocow_one_range() failed and the range [cur_offset, nocow_end] has
+		 * already been cleaned up.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cur_offset - start;
+ if (nocow_end)
+ untouched_start = nocow_end + 1;
+ else
+ untouched_start = cur_offset;
+ untouched_len = end + 1 - untouched_start;
+ } else if (cow_start != (u64)-1 && cow_end == 0) {
+ /*
+ * case b)
+ * start cow_start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+		 * We found a range that needs COW, but the error happened before we hit
+		 * the next NOCOW range, thus [cow_start, cur_offset) has no OE yet.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_start;
+ untouched_len = end + 1 - untouched_start;
+ } else {
+ /*
+ * case c)
+ * start cow_start cow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+		 * fallback_to_cow() failed, and it does its own cleanup for its range,
+		 * so we must not touch the range [cow_start, cow_end] here.
+ */
+ ASSERT(cow_start != (u64)-1 && cow_end != 0);
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_end + 1;
+ untouched_len = end + 1 - untouched_start;
+ }
+
+ if (oe_cleanup_len) {
+		const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
+
+ btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
+ extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
+ locked_folio, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ }
- /*
- * We need to lock the extent here because we're clearing DELALLOC and
- * we're not locked at this point.
- */
- if (cur_offset < end) {
+ if (untouched_len) {
struct extent_state *cached = NULL;
+ const u64 untouched_end = untouched_start + untouched_len - 1;
- lock_extent(&inode->io_tree, cur_offset, end, &cached);
- extent_clear_unlock_delalloc(inode, cur_offset, end,
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
+ extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
- btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
}
btrfs_free_path(path);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
+ untouched_start, untouched_len, ret);
return ret;
}
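
The case analysis under the error label boils down to a small piece of range arithmetic. A minimal standalone sketch (userspace C; the struct and function names are invented for the example, only the math mirrors the cases a/b/c above):

#include <assert.h>
#include <stdint.h>

#define COW_START_UNSET ((uint64_t)-1)

struct err_ranges {
	uint64_t oe_cleanup_start, oe_cleanup_len;
	uint64_t untouched_start, untouched_len;
};

/* Mirror of the case a)/b)/c) selection done under the error label. */
static struct err_ranges split_nocow_error_range(uint64_t start, uint64_t end,
						 uint64_t cur_offset,
						 uint64_t cow_start,
						 uint64_t cow_end,
						 uint64_t nocow_end)
{
	struct err_ranges r = { .oe_cleanup_start = start };

	if (cow_start == COW_START_UNSET) {
		/* case a: the last finished step ended at cur_offset */
		r.oe_cleanup_len = cur_offset - start;
		r.untouched_start = nocow_end ? nocow_end + 1 : cur_offset;
	} else if (cow_end == 0) {
		/* case b: a pending COW range without any ordered extent yet */
		r.oe_cleanup_len = cow_start - start;
		r.untouched_start = cow_start;
	} else {
		/* case c: fallback_to_cow() already cleaned [cow_start, cow_end] */
		r.oe_cleanup_len = cow_start - start;
		r.untouched_start = cow_end + 1;
	}
	r.untouched_len = end + 1 - r.untouched_start;
	return r;
}

int main(void)
{
	/* case c: COW over [256K, 512K) failed inside a [0, 1M) delalloc range */
	struct err_ranges r = split_nocow_error_range(0, 1048575, 262144,
						      262144, 524287, 0);

	assert(r.oe_cleanup_start == 0 && r.oe_cleanup_len == 262144);
	assert(r.untouched_start == 524288 && r.untouched_len == 524288);
	return 0;
}
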
@@ -2262,7 +2333,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
+ btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2284,11 +2355,11 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* can confuse the caller.
*/
ASSERT(!(end <= folio_pos(locked_folio) ||
- start >= folio_pos(locked_folio) + folio_size(locked_folio)));
+ start >= folio_next_pos(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
- goto out;
+ return ret;
}
if (btrfs_inode_can_compress(inode) &&
@@ -2300,13 +2371,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_folio, start, end, NULL,
- false, false);
-
-out:
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_folio, start,
- end - start + 1);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
return ret;
}
@@ -2556,7 +2621,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
!btrfs_is_free_space_inode(inode) &&
!(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2640,12 +2705,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (em_len > search_len)
em_len = search_len;
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, cached_state);
+ ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state);
next:
- search_start = extent_map_end(em);
- free_extent_map(em);
+ search_start = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
if (ret)
return ret;
}
@@ -2675,8 +2740,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
return ret;
}
- return set_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC | extra_bits, cached_state);
+ return btrfs_set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2697,7 +2762,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
+ u64 page_end = folio_next_pos(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -2751,7 +2816,7 @@ again:
if (ret)
goto out_page;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (folio_test_ordered(folio))
@@ -2759,8 +2824,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -2786,7 +2851,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2833,6 +2898,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
return 0;
/*
+	 * For experimental builds, we error out instead of returning EAGAIN.
+ *
+ * We should not hit such out-of-band dirty folios anymore.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+ DEBUG_WARN();
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ btrfs_root_id(BTRFS_I(inode)->root),
+ btrfs_ino(BTRFS_I(inode)),
+ folio_pos(folio));
+ return -EUCLEAN;
+ }
+
+ /*
* folio_checked is set below when we create a fixup worker for this
* folio, don't try to create another one if we're already
* folio_test_checked.
@@ -2851,7 +2931,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the folio lock, and we can't trust
- * page->mapping outside of the folio lock.
+ * folio->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
@@ -2872,7 +2952,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 sectorsize = root->fs_info->sectorsize;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
@@ -2907,8 +2987,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
+ ins.offset = file_pos;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
@@ -2921,14 +3001,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
* number of bytes only for that range containing the inline extent.
- * The remaining of the range will be processed when clearning the
+	 * The remainder of the range will be processed when clearing the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
@@ -2944,8 +3023,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
- ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = disk_num_bytes;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
@@ -2955,8 +3034,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
file_pos - offset,
qgroup_reserved, &ins);
out:
- btrfs_free_path(path);
-
return ret;
}
@@ -3046,14 +3123,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (!freespace_inode)
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
ret = -EIO;
goto out;
}
- if (btrfs_is_zoned(fs_info))
- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
+ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ if (ret)
+ goto out;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3063,6 +3141,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ /*
+ * If it's a COW write we need to lock the extent range as we will be
+ * inserting/replacing file extent items and unpinning an extent map.
+ * This must be taken before joining a transaction, as it's a higher
+ * level lock (like the inode's VFS lock), otherwise we can run into an
+ * ABBA deadlock with other tasks (transactions work like a lock,
+ * depending on their current state).
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
+ btrfs_lock_extent_bits(io_tree, start, end,
+ EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
+ &cached_state);
+ }
+
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3076,7 +3169,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3084,7 +3177,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
/* Logic error */
ASSERT(list_empty(&ordered_extent->list));
- if (!list_empty(&ordered_extent->list)) {
+ if (unlikely(!list_empty(&ordered_extent->list))) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3092,16 +3185,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
/* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
}
goto out;
}
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3122,20 +3212,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = unpin_extent_cache(inode, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
- if (ret < 0) {
+ ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3147,26 +3237,24 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
*/
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
- clear_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) { /* -ENOMEM or corruption */
+ if (unlikely(ret)) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
out:
- clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 unwritten_start = start;
-
/*
* If we failed to finish this ordered extent for any reason we
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
@@ -3178,10 +3266,6 @@ out:
if (ret)
btrfs_mark_ordered_extent_error(ordered_extent);
- if (truncated)
- unwritten_start += logical_len;
- clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
-
/*
* Drop extent maps for the part of the extent we didn't write.
*
@@ -3196,9 +3280,15 @@ out:
* we don't mess with the extent map tree in the NOCOW case, but
* for now simply skip this if we are the free space inode.
*/
- if (!btrfs_is_free_space_inode(inode))
+ if (!btrfs_is_free_space_inode(inode)) {
+ u64 unwritten_start = start;
+
+ if (truncated)
+ unwritten_start += logical_len;
+
btrfs_drop_extent_map_range(inode, unwritten_start,
end, false);
+ }
/*
* If the ordered extent had an IOERR or something else went
@@ -3225,7 +3315,7 @@ out:
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes, 1);
+ ordered_extent->disk_num_bytes, true);
/*
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
@@ -3260,35 +3350,89 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
}
/*
- * Verify the checksum for a single sector without any extra action that depend
- * on the type of I/O.
+ * Calculate the checksum of an fs block at physical memory address @paddr,
+ * and save the result to @dest.
+ *
+ * The folio containing @paddr must be large enough to contain a full fs block.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected)
+void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddr, u8 *dest)
{
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
+ struct folio *folio = page_folio(phys_to_page(paddr));
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
- ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+ /* The full block must be inside the folio. */
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
- shash->tfm = fs_info->csum_shash;
+ for (int i = 0; i < nr_steps; i++) {
+ u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT;
- kaddr = kmap_local_page(page) + pgoff;
- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- kunmap_local(kaddr);
+ /*
+ * For bs <= ps cases, we will only run the loop once, so the offset
+		 * inside the page will only be added to paddrs[0].
+		 *
+		 * For bs > ps cases, the block must be page aligned, thus the offset
+ * inside the page will always be 0.
+ */
+ paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr);
+ }
+ return btrfs_calculate_block_csum_pages(fs_info, paddrs, dest);
+}
+
+/*
+ * Calculate the checksum of an fs block backed by multiple noncontiguous pages
+ * at @paddrs[] and save the result to @dest.
+ *
+ * Each entry of @paddrs[] must point to min(blocksize, PAGE_SIZE) bytes of
+ * memory that lies fully inside its page.
+ */
+void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddrs[], u8 *dest)
+{
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- if (memcmp(csum, csum_expected, fs_info->csum_size))
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ for (int i = 0; i < nr_steps; i++) {
+ const phys_addr_t paddr = paddrs[i];
+ void *kaddr;
+
+ ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE);
+ kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
+ crypto_shash_update(shash, kaddr, step);
+ kunmap_local(kaddr);
+ }
+ crypto_shash_final(shash, dest);
+}
+
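
The helpers above always feed the hash in min(blocksize, PAGE_SIZE)-sized steps, which is also what bounds the on-stack paddrs[] array at BTRFS_MAX_BLOCKSIZE / PAGE_SIZE entries. A minimal standalone sketch of that arithmetic (userspace C, assuming 4K pages purely for the example):

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u		/* assumed page size for this sketch */

int main(void)
{
	const uint32_t blocksizes[] = { 4096, 16384, 65536 };

	for (unsigned int i = 0; i < sizeof(blocksizes) / sizeof(blocksizes[0]); i++) {
		uint32_t bs = blocksizes[i];
		uint32_t step = bs < EXAMPLE_PAGE_SIZE ? bs : EXAMPLE_PAGE_SIZE;
		uint32_t nr_steps = bs / step;

		/*
		 * bs <= ps: a single step, the whole block lives in one page
		 * (on larger-page kernels, e.g. 64K pages with 4K blocks,
		 * step == bs and nr_steps == 1).
		 * bs >  ps: one step per page, so the digest is updated with
		 * page-sized chunks from possibly noncontiguous pages.
		 */
		printf("blocksize=%5u step=%4u nr_steps=%2u\n", bs, step, nr_steps);
	}
	return 0;
}
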
+/*
+ * Verify the checksum for a single block without any extra action that depends
+ * on the type of I/O.
+ *
+ * @paddr must point to a full fs block contained in a single folio.
+ */
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected)
+{
+ btrfs_calculate_block_csum_folio(fs_info, paddr, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
return -EIO;
return 0;
}
/*
- * Verify the checksum of a single data sector.
+ * Verify the checksum of a single data sector, which can be scattered across
+ * noncontiguous pages.
*
* @bbio: btrfs_io_bio which contains the csum
* @dev: device the sector is on
* @bio_offset: offset to the beginning of the bio (in bytes)
- * @bv: bio_vec to check
+ * @paddrs: physical addresses which back the fs block
*
* Check if the checksum on a data block is valid. When a checksum mismatch is
* detected, report the error and fill the corrupted range with zero.
@@ -3296,33 +3440,34 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
* Return %true if the sector is ok or had no checksum to start with, else %false.
*/
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv)
+ u32 bio_offset, const phys_addr_t paddrs[])
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
u64 file_offset = bbio->file_offset + bio_offset;
- u64 end = file_offset + bv->bv_len - 1;
+ u64 end = file_offset + blocksize - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- ASSERT(bv->bv_len == fs_info->sectorsize);
-
if (!bbio->csum)
return true;
if (btrfs_is_data_reloc_root(inode->root) &&
- test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- NULL)) {
+ btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
+ NULL)) {
/* Skip the range without csum for data reloc inode */
- clear_extent_bits(&inode->io_tree, file_offset, end,
- EXTENT_NODATASUM);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM, NULL);
return true;
}
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
- csum_expected))
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
goto zeroit;
return true;
@@ -3331,7 +3476,8 @@ zeroit:
bbio->mirror_num);
if (dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memzero_bvec(bv);
+ for (int i = 0; i < nr_steps; i++)
+ memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step);
return false;
}
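
The csum_expected lookup above is plain index arithmetic over the per-bio checksum array. A standalone sketch with assumed values (4K sector size, 4-byte CRC32C checksums; the numbers are illustrative, not taken from the code above):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;	/* assumed 4K blocks */
	const uint32_t csum_size = 4;		/* assumed CRC32C */
	const uint32_t bio_offset = 8192;	/* third block of the bio */

	/* index of the block inside the bio, then byte offset into bbio->csum */
	uint32_t block_index = bio_offset >> sectorsize_bits;
	uint32_t csum_offset = block_index * csum_size;

	printf("block %u uses csum bytes [%u, %u)\n",
	       block_index, csum_offset, csum_offset + csum_size);
	return 0;
}
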
@@ -3353,6 +3499,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode)
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
+ WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
atomic_inc(&fs_info->nr_delayed_iputs);
/*
* Need to be irq safe here because we can be called from either an irq
@@ -3444,7 +3591,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3469,11 +3616,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
- struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
@@ -3492,6 +3638,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -3615,10 +3763,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (!inode || inode->i_nlink) {
+ if (!inode || inode->vfs_inode.i_nlink) {
if (inode) {
- ret = btrfs_drop_verity_items(BTRFS_I(inode));
- iput(inode);
+ ret = btrfs_drop_verity_items(inode);
+ iput(&inode->vfs_inode);
inode = NULL;
if (ret)
goto out;
@@ -3641,7 +3789,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
nr_unlink++;
/* this will do delete_inode and everything for us */
- iput(inode);
+ iput(&inode->vfs_inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3658,19 +3806,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
- btrfs_free_path(path);
return ret;
}
/*
- * very simple check to peek ahead in the leaf looking for xattrs. If we
- * don't find any xattrs, we know there can't be any acls.
+ * Look ahead in the leaf for xattrs. If we don't find any then we know there
+ * can't be any ACLs.
*
- * slot is the slot the inode is in, objectid is the objectid of the inode
+ * @leaf: the extent buffer leaf to search
+ * @slot: the slot the inode is in
+ * @objectid: the objectid of the inode
+ *
+ * Return true if there is an xattr/ACL, false otherwise.
*/
-static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid,
- int *first_xattr_slot)
+static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
@@ -3690,45 +3841,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- /* we found a different objectid, there must not be acls */
+ /* We found a different objectid, there must be no ACLs. */
if (found_key.objectid != objectid)
- return 0;
+ return false;
- /* we found an xattr, assume we've got an acl */
+ /* We found an xattr, assume we've got an ACL. */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
if (found_key.offset == xattr_access ||
found_key.offset == xattr_default)
- return 1;
+ return true;
}
/*
- * we found a key greater than an xattr key, there can't
- * be any acls later on
+ * We found a key greater than an xattr key, there can't be any
+ * ACLs later on.
*/
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
- return 0;
+ return false;
slot++;
scanned++;
/*
- * it goes inode, inode backrefs, xattrs, extents,
- * so if there are a ton of hard links to an inode there can
- * be a lot of backrefs. Don't waste time searching too hard,
- * this is just an optimization
+ * The item order goes like:
+ * - inode
+ * - inode backrefs
+ * - xattrs
+	 * - extents
+ *
+ * so if there are lots of hard links to an inode there can be
+ * a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization.
*/
if (scanned >= 8)
break;
}
- /* we hit the end of the leaf before we found an xattr or
- * something larger than an xattr. We have to assume the inode
- * has acls
+ /*
+ * We hit the end of the leaf before we found an xattr or something
+ * larger than an xattr. We have to assume the inode has ACLs.
*/
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
- return 1;
+ return true;
}
static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
@@ -3748,7 +3904,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
if (!inode->file_extent_tree)
return -ENOMEM;
- extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
/* Lockdep class is set only for the file extent tree. */
lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
@@ -3779,7 +3936,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
}
return 0;
@@ -3791,12 +3948,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
*
* On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
+static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode *vfs_inode = &inode->vfs_inode;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
@@ -3805,17 +3963,13 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
ASSERT(path);
- btrfs_get_inode_key(BTRFS_I(inode), &location);
+ btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3835,43 +3989,47 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
- i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
-
- inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+
+ inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
btrfs_timespec_nsec(leaf, &inode_item->mtime));
- inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
+ inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
- inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
- BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
+ inode->generation = btrfs_inode_generation(leaf, inode_item);
+ inode->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode_set_iversion_queried(inode,
- btrfs_inode_sequence(leaf, inode_item));
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
+ inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
+ vfs_inode->i_generation = inode->generation;
+ vfs_inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
+ btrfs_update_inode_mapping_flags(inode);
+ btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -3881,9 +4039,8 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
@@ -3912,7 +4069,7 @@ cache_index:
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
- BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_unlink_trans = inode->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
@@ -3920,15 +4077,15 @@ cache_index:
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
- BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_reflink_trans = inode->last_trans;
path->slots[0]++;
- if (inode->i_nlink != 1 ||
+ if (vfs_inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
- if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ if (location.objectid != btrfs_ino(inode))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3936,13 +4093,12 @@ cache_index:
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ inode->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
- extref);
+ inode->dir_index = btrfs_inode_extref_index(leaf, extref);
}
cache_acl:
/*
@@ -3950,50 +4106,49 @@ cache_acl:
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ btrfs_ino(inode), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)),
- btrfs_root_id(root), ret);
+ btrfs_ino(inode), btrfs_root_id(root), ret);
}
if (!maybe_acls)
- cache_no_acl(inode);
+ cache_no_acl(vfs_inode);
- switch (inode->i_mode & S_IFMT) {
+ switch (vfs_inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_fop = &btrfs_file_operations;
+ vfs_inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
- inode->i_fop = &btrfs_dir_file_operations;
- inode->i_op = &btrfs_dir_inode_operations;
+ vfs_inode->i_fop = &btrfs_dir_file_operations;
+ vfs_inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(vfs_inode);
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
+ vfs_inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ ret = btrfs_add_inode_to_root(inode, true);
if (ret)
goto out;
return 0;
out:
- iget_failed(inode);
+ iget_failed(vfs_inode);
return ret;
}
@@ -4005,45 +4160,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
-
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
-
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
/*
@@ -4053,7 +4198,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4067,7 +4212,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto failed;
+ return ret;
}
leaf = path->nodes[0];
@@ -4075,12 +4220,8 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
- ret = 0;
-failed:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -4125,6 +4266,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4146,20 +4304,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto err;
+ btrfs_free_path(path);
+ return di ? PTR_ERR(di) : -ENOENT;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ /*
+ * Down the call chains below we'll also need to allocate a path, so no
+ * need to hold on to this one for longer than necessary.
+ */
+ btrfs_free_path(path);
if (ret)
- goto err;
- btrfs_release_path(path);
+ return ret;
/*
* If we don't have dir index, we have to get it by looking up
@@ -4180,21 +4340,21 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
- if (ret) {
- btrfs_info(fs_info,
- "failed to delete reference to %.*s, inode %llu parent %llu",
- name->len, name->name, ino, dir_ino);
+ if (unlikely(ret)) {
+ btrfs_crit(fs_info,
+ "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
+ name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
skip_backref:
if (rename_ctx)
rename_ctx->index = index;
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
/*
@@ -4204,8 +4364,8 @@ skip_backref:
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
- btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
+ btrfs_del_inode_ref_in_log(trans, name, inode, dir);
+ btrfs_del_dir_entries_in_log(trans, name, dir, index);
}
/*
@@ -4218,19 +4378,14 @@ skip_backref:
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
-err:
- btrfs_free_path(path);
- if (ret)
- goto out;
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
- ret = btrfs_update_inode(trans, dir);
-out:
- return ret;
+ update_time_after_link_or_unlink(dir);
+
+ return btrfs_update_inode(trans, dir);
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -4309,7 +4464,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = dir->root;
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -4352,7 +4507,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4383,14 +4538,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_del_root_ref(trans, objectid,
btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4402,7 +4557,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, ret);
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return ret;
}
@@ -4414,7 +4568,7 @@ out:
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct btrfs_key key;
struct fscrypt_str name = FSTR_INIT("default", 7);
@@ -4436,7 +4590,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
- goto out;
+ return ret;
}
btrfs_release_path(path);
}
@@ -4447,14 +4601,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret == 0) {
+ return ret;
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = 0;
@@ -4464,8 +4617,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -4481,7 +4633,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ if (icount_read(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
@@ -4564,13 +4716,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4584,7 +4736,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
btrfs_root_id(dest));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4592,7 +4744,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4601,7 +4753,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4637,68 +4789,68 @@ out_up_write:
return ret;
}
-static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_inode *dir = BTRFS_I(vfs_dir);
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
btrfs_err(fs_info,
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ return btrfs_delete_subvolume(dir, dentry);
}
- ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
if (ret)
return ret;
/* This needs to handle no-key deletions later on */
- trans = __unlink_start_trans(BTRFS_I(dir));
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (inode->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, dir);
+
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
- &fname.disk_name);
- if (!ret) {
- btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
+ ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
+ if (!ret)
+ btrfs_i_size_write(inode, 0);
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4708,20 +4860,80 @@ out_notrans:
return ret;
}
+static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
+{
+ ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
+ blockstart, blocksize);
+
+ if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
+ return true;
+ return false;
+}
+
+static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
+{
+ const pgoff_t index = (start >> PAGE_SHIFT);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct folio *folio;
+ u64 zero_start;
+ u64 zero_end;
+ int ret = 0;
+
+again:
+ folio = filemap_lock_folio(mapping, index);
+ /* No folio present. */
+ if (IS_ERR(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
+ if (unlikely(!folio_test_uptodate(folio))) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ folio_wait_writeback(folio);
+
+ /*
+	 * We do not need to lock the extent range nor wait for ordered extents,
+	 * as the range is already beyond EOF.
+ */
+
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = folio_next_pos(folio);
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start);
+
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
/*
- * Read, zero a chunk and write a block.
+ * Handle the truncation of a fs block.
+ *
+ * @inode - inode that we're zeroing
+ * @offset - the file offset of the block to truncate
+ *	     The value must be inside [@start, @end]; the function will check
+ *	     whether the block that covers @offset needs to be zeroed.
+ * @start - the start file offset of the range we want to zero
+ * @end - the end (inclusive) file offset of the range we want to zero.
*
- * @inode - inode that we're zeroing
- * @from - the offset to start zeroing
- * @len - the length to zero, 0 to zero the entire range respective to the
- * offset
- * @front - zero up to the offset instead of from the offset on
+ * If the range is not block aligned, read out the folio that covers @offset,
+ * and if needed zero the blocks inside the folio that are covered by [@start, @end].
+ * If @start or @end + 1 lands inside a block, that block will be marked dirty
+ * for writeback.
*
- * This will find the block for the "from" offset and cow the block and zero the
- * part we want to zero. This is used with truncate and hole punching.
+ * This is used by hole punching, zero range and file expansion.
*/
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
@@ -4731,27 +4943,66 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct extent_changeset *data_reserved = NULL;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (blocksize - 1);
+ pgoff_t index = (offset >> PAGE_SHIFT);
struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- size_t write_bytes = blocksize;
int ret = 0;
+ const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
+ blocksize);
+ const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
+ blocksize);
+ bool need_truncate_head = false;
+ bool need_truncate_tail = false;
+ u64 zero_start;
+ u64 zero_end;
u64 block_start;
u64 block_end;
- if (IS_ALIGNED(offset, blocksize) &&
- (!len || IS_ALIGNED(len, blocksize)))
+ /* @offset should be inside the range. */
+ ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
+ offset, start, end);
+
+ /* The range is aligned at both ends. */
+ if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
+ /*
+ * For block size < page size case, we may have polluted blocks
+ * beyond EOF. So we also need to zero them out.
+ */
+ if (end == (u64)-1 && blocksize < PAGE_SIZE)
+ ret = truncate_block_zero_beyond_eof(inode, start);
+ goto out;
+ }
+
+ /*
+	 * @offset may be inside neither the head block nor the tail block. In
+	 * that case we don't need to do anything.
+ */
+ if (!in_head_block && !in_tail_block)
goto out;
- block_start = round_down(from, blocksize);
+ /*
+ * Skip the truncation if the range in the target block is already aligned.
+	 * The seemingly complex checks also handle the case where the head and
+	 * tail are the same block.
+ */
+ if (in_head_block && !IS_ALIGNED(start, blocksize))
+ need_truncate_head = true;
+ if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
+ need_truncate_tail = true;
+ if (!need_truncate_head && !need_truncate_tail)
+ goto out;
+
+ block_start = round_down(offset, blocksize);
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize, false);
if (ret < 0) {
+ size_t write_bytes = blocksize;
+
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
- /* For nocow case, no need to reserve data space */
+ /* For nocow case, no need to reserve data space. */
+ ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
+ write_bytes, blocksize);
only_release_metadata = true;
} else {
goto out;
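
The head/tail selection in the hunk above reduces to alignment arithmetic on the
fs block size. Below is a minimal standalone sketch (userspace C, with an assumed
4K block size and made-up offsets); it mirrors the is_inside_block() helper and the
need_truncate_head/need_truncate_tail logic, and is illustrative only, not part of
the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCKSIZE 4096ULL	/* assumed fs block size */

    static uint64_t round_down_bs(uint64_t n)
    {
    	return n - (n % BLOCKSIZE);
    }

    static bool is_inside_block(uint64_t bytenr, uint64_t blockstart)
    {
    	return blockstart <= bytenr && bytenr <= blockstart + BLOCKSIZE - 1;
    }

    int main(void)
    {
    	/* Example: zero the range [5000, 12287] and check its head block. */
    	const uint64_t start = 5000, end = 12287;
    	const uint64_t offset = start;		/* this call looks at the head */
    	const bool in_head = is_inside_block(offset, round_down_bs(start));
    	const bool in_tail = is_inside_block(offset, round_down_bs(end));
    	const bool zero_head = in_head && (start % BLOCKSIZE != 0);
    	const bool zero_tail = in_tail && ((end + 1) % BLOCKSIZE != 0);

    	/* Prints in_head=1 zero_head=1 in_tail=0 zero_tail=0: the head block
    	 * [4096, 8191] needs zeroing of [5000, 8191], while end + 1 == 12288
    	 * is block aligned, so there is no tail work for this call. */
    	printf("in_head=%d zero_head=%d in_tail=%d zero_tail=%d\n",
    	       in_head, zero_head, in_tail, zero_tail);
    	return 0;
    }

With @offset == end instead, the same two checks would select the tail block; the
fully aligned case and the truncate-beyond-EOF case are handled before this logic
runs, as the hunk above shows.
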
@@ -4768,10 +5019,13 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out;
}
@@ -4783,7 +5037,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -4801,11 +5055,11 @@ again:
folio_wait_writeback(folio);
- lock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -4813,37 +5067,46 @@ again:
goto again;
}
- clear_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
- if (offset != blocksize) {
- if (!len)
- len = blocksize - offset;
- if (front)
- folio_zero_range(folio, block_start - folio_pos(folio),
- offset);
- else
- folio_zero_range(folio,
- (block_start - folio_pos(folio)) + offset,
- len);
+ if (end == (u64)-1) {
+ /*
+		 * We're truncating beyond EOF. The remaining blocks are normally
+		 * already holes, so there is no need to zero them again. However,
+		 * when the fs block size is smaller than the page size,
+		 * memory-mapped writes can pollute ranges beyond EOF.
+		 *
+		 * Although such polluted blocks beyond EOF will never reach disk,
+		 * they still affect our page cache.
+ */
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
+ } else {
+ zero_start = max_t(u64, block_start, start);
+ zero_end = min_t(u64, block_end, end);
}
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start + 1);
+
btrfs_folio_clear_checked(fs_info, folio, block_start,
block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
- unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, &cached_state);
+
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret) {
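
For the beyond-EOF branch above, the zero range runs from the truncation start to
the end of the folio, wiping blocks past EOF that memory-mapped writes may have
dirtied. A standalone sketch with assumed values (4K fs blocks, a 64K folio at
file offset 0; illustrative only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }
    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    int main(void)
    {
    	/* Truncate down to i_size == 5000, so start == 5000, end == (u64)-1. */
    	const uint64_t folio_pos = 0, folio_size = 65536;
    	const uint64_t start = 5000, end = UINT64_MAX;
    	const uint64_t zero_start = max_u64(folio_pos, start);
    	const uint64_t zero_end = min_u64(folio_pos + folio_size - 1, end);

    	/* Zeroes folio bytes [5000, 65535]: every block past the new EOF that
    	 * a memory-mapped write may have dirtied inside this folio. */
    	printf("zero [%llu, %llu], %llu bytes\n",
    	       (unsigned long long)zero_start, (unsigned long long)zero_end,
    	       (unsigned long long)(zero_end - zero_start + 1));
    	return 0;
    }
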
@@ -4894,7 +5157,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -4936,7 +5199,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
if (ret)
return ret;
@@ -4953,7 +5216,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
em = NULL;
break;
}
- last_byte = min(extent_map_end(em), block_end);
+ last_byte = min(btrfs_extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
@@ -4969,7 +5232,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (ret)
break;
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, cur_offset,
cur_offset + hole_size - 1,
@@ -4986,7 +5249,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->generation = btrfs_get_fs_generation(fs_info);
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
} else {
ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
@@ -4994,14 +5257,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
break;
}
next:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
- free_extent_map(em);
- unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return ret;
}
@@ -5081,7 +5344,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
- int err;
+ int ret2;
/*
* Truncate failed, so fix up the in-memory size. We
@@ -5089,9 +5352,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
- if (err)
- return err;
+ ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
+ if (ret2)
+ return ret2;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5104,31 +5367,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- int err;
+ int ret;
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
+ ret = setattr_prepare(idmap, dentry, attr);
+ if (ret)
+ return ret;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr);
- if (err)
- return err;
+ ret = btrfs_setsize(inode, attr);
+ if (ret)
+ return ret;
}
if (attr->ia_valid) {
setattr_copy(idmap, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(BTRFS_I(inode));
+ ret = btrfs_dirty_inode(BTRFS_I(inode));
- if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(idmap, dentry, inode->i_mode);
+ if (!ret && attr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
}
- return err;
+ return ret;
}
/*
@@ -5149,7 +5412,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct rb_node *node;
- ASSERT(inode->i_state & I_FREEING);
+ ASSERT(inode_state_read_once(inode) & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
@@ -5185,7 +5448,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5199,9 +5462,9 @@ static void evict_inode_truncate_pages(struct inode *inode)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1, NULL);
- clear_extent_bit(io_tree, start, end,
- EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
- &cached_state);
+ btrfs_clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+ &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5262,7 +5525,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv = NULL;
+ struct btrfs_block_rsv rsv;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5310,11 +5573,9 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- goto out;
- rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_metadata_size(fs_info, 1);
+ rsv.failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5326,11 +5587,11 @@ void btrfs_evict_inode(struct inode *inode)
.min_type = 0,
};
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (IS_ERR(trans))
- goto out;
+ goto out_release;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
@@ -5342,7 +5603,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto out;
+ goto out_release;
else if (!ret)
break;
}
@@ -5356,16 +5617,17 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (!IS_ERR(trans)) {
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out:
- btrfs_free_block_rsv(fs_info, rsv);
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -5387,7 +5649,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = dir->root;
int ret = 0;
struct fscrypt_name fname;
@@ -5398,7 +5660,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
if (ret < 0)
- goto out;
+ return ret;
/*
* fscrypt_setup_filename() should never return a positive value, but
* gcc on sparc/parisc thinks it can, so assert that doesn't happen.
@@ -5415,19 +5677,18 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
- if (location->type != BTRFS_INODE_ITEM_KEY &&
- location->type != BTRFS_ROOT_ITEM_KEY) {
+ if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY)) {
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
-"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
+"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")",
__func__, fname.disk_name.name, btrfs_ino(dir),
- location->objectid, location->type, location->offset);
+ BTRFS_KEY_FMT_VALUE(location));
}
if (!ret)
*type = btrfs_dir_ftype(path->nodes[0], di);
out:
fscrypt_free_filename(&fname);
- btrfs_free_path(path);
return ret;
}
@@ -5442,7 +5703,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -5498,7 +5759,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
location->offset = 0;
err = 0;
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return err;
}
@@ -5512,7 +5772,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
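
The comment above describes the race this hunk closes: a new in-memory inode for
the same inode number may already have replaced the dying one in root->inodes, so
a plain erase could drop the wrong entry. A standalone sketch of the
compare-exchange idea using C11 atomics (illustrative only; the kernel code uses
__xa_cmpxchg() under xa_lock, not atomics on a single slot):

    #include <stdatomic.h>
    #include <stdio.h>

    struct inode_stub { unsigned long ino; };

    /* One slot standing in for root->inodes[ino]. */
    static _Atomic(struct inode_stub *) slot;

    /* Remove the slot only if it still points at the inode being freed. */
    static void del_from_root(struct inode_stub *dying)
    {
    	struct inode_stub *expected = dying;

    	if (!atomic_compare_exchange_strong(&slot, &expected, NULL))
    		printf("slot was re-used (now ino %lu), left untouched\n",
    		       expected ? expected->ino : 0UL);
    }

    int main(void)
    {
    	struct inode_stub old_inode = { .ino = 257 };
    	struct inode_stub new_inode = { .ino = 257 };

    	atomic_store(&slot, &old_inode);
    	/* A new inode for the same ino raced in before the old one was freed. */
    	atomic_store(&slot, &new_inode);

    	del_from_root(&old_inode);	/* must not remove new_inode's entry */
    	printf("slot still holds the new inode: %d\n",
    	       atomic_load(&slot) == &new_inode);
    	return 0;
    }
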
@@ -5549,7 +5819,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5561,40 +5831,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
- return inode;
+ if (!inode)
+ return NULL;
+ return BTRFS_I(inode);
}
/*
* Get an inode object given its inode number and corresponding root. Path is
* preallocated to prevent recursing back to iget through allocator.
*/
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path)
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
int ret;
inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
/*
* Get an inode object given its inode number and corresponding root.
*/
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_path *path;
int ret;
@@ -5602,55 +5874,62 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ iget_failed(&inode->vfs_inode);
return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_read_locked_inode(inode, path);
btrfs_free_path(path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
-static struct inode *new_simple_dir(struct inode *dir,
- struct btrfs_key *key,
- struct btrfs_root *root)
+static struct btrfs_inode *new_simple_dir(struct inode *dir,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
{
struct timespec64 ts;
- struct inode *inode = new_inode(dir->i_sb);
+ struct inode *vfs_inode;
+ struct btrfs_inode *inode;
- if (!inode)
+ vfs_inode = new_inode(dir->i_sb);
+ if (!vfs_inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = btrfs_grab_root(root);
- BTRFS_I(inode)->ref_root_id = key->objectid;
- set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
- set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ inode = BTRFS_I(vfs_inode);
+ inode->root = btrfs_grab_root(root);
+ inode->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
+ set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
- btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
+ btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
- inode->i_op = &simple_dir_inode_operations;
- inode->i_opflags &= ~IOP_XATTR;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ vfs_inode->i_op = &simple_dir_inode_operations;
+ vfs_inode->i_opflags &= ~IOP_XATTR;
+ vfs_inode->i_fop = &simple_dir_operations;
+ vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- ts = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, ts);
- inode_set_atime_to_ts(inode, inode_get_atime(dir));
- BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
- BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+ ts = inode_set_ctime_current(vfs_inode);
+ inode_set_mtime_to_ts(vfs_inode, ts);
+ inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
+ inode->i_otime_sec = ts.tv_sec;
+ inode->i_otime_nsec = ts.tv_nsec;
- inode->i_uid = dir->i_uid;
- inode->i_gid = dir->i_gid;
+ vfs_inode->i_uid = dir->i_uid;
+ vfs_inode->i_gid = dir->i_gid;
return inode;
}
@@ -5664,15 +5943,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
-static inline u8 btrfs_inode_type(struct inode *inode)
+static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
{
- return fs_umode_to_ftype(inode->i_mode);
+ return fs_umode_to_ftype(inode->vfs_inode.i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location = { 0 };
@@ -5689,18 +5968,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
- if (btrfs_inode_type(inode) != di_type) {
+ if (unlikely(btrfs_inode_type(inode) != di_type)) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
- inode->i_mode, btrfs_inode_type(inode),
+ inode->vfs_inode.i_mode, btrfs_inode_type(inode),
di_type);
- iput(inode);
+ iput(&inode->vfs_inode);
return ERR_PTR(-EUCLEAN);
}
- return inode;
+ return &inode->vfs_inode;
}
ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
@@ -5715,19 +5994,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
btrfs_put_root(sub_root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
down_read(&fs_info->cleanup_work_sem);
- if (!sb_rdonly(inode->i_sb))
+ if (!sb_rdonly(inode->vfs_inode.i_sb))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&fs_info->cleanup_work_sem);
if (ret) {
- iput(inode);
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
}
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return &inode->vfs_inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
@@ -5767,7 +6049,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key, found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
@@ -5781,15 +6063,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
/* FIXME: we should be able to handle this */
if (ret == 0)
- goto out;
- ret = 0;
+ return ret;
if (path->slots[0] == 0) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
path->slots[0]--;
@@ -5800,13 +6081,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
@@ -5909,7 +6189,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
void *addr;
LIST_HEAD(ins_list);
LIST_HEAD(del_list);
@@ -5992,8 +6272,7 @@ again:
if (ret)
goto nopos;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
- if (ret)
+ if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
goto nopos;
/*
@@ -6022,7 +6301,6 @@ nopos:
err:
if (put)
btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
- btrfs_free_path(path);
return ret;
}
@@ -6064,8 +6342,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
}
/*
- * This is a copy of file_update_time. We need this so we can return error on
- * ENOSPC for updating the inode in the case of file write and mmap writes.
+ * We need our own ->update_time so that we can return an error on ENOSPC
+ * when updating the inode for file writes and mmap writes.
*/
static int btrfs_update_time(struct inode *inode, int flags)
{
@@ -6200,7 +6478,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6280,12 +6558,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (!args->subvol)
btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
+ btrfs_update_inode_mapping_flags(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6332,7 +6612,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6370,7 +6650,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6380,7 +6659,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
path = NULL;
if (args->subvol) {
- struct inode *parent;
+ struct btrfs_inode *parent;
/*
* Subvolumes inherit properties from their parent subvolume,
@@ -6390,11 +6669,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
- ret = btrfs_inode_inherit_props(trans, inode, parent);
- iput(parent);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ parent);
+ iput(&parent->vfs_inode);
}
} else {
- ret = btrfs_inode_inherit_props(trans, inode, dir);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ BTRFS_I(dir));
}
if (ret) {
btrfs_err(fs_info,
@@ -6408,7 +6689,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
*/
if (!args->subvol) {
ret = btrfs_init_inode_security(trans, args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6428,13 +6709,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto discard;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
return 0;
@@ -6462,7 +6747,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index)
+ const struct fscrypt_str *name, bool add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6492,10 +6777,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
- btrfs_inode_type(&inode->vfs_inode), index);
+ btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
- else if (ret) {
+ else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6503,15 +6788,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
@@ -6521,20 +6798,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
- int err;
- err = btrfs_del_root_ref(trans, key.objectid,
- btrfs_root_id(root), parent_ino,
- &local_index, name);
- if (err)
- btrfs_abort_transaction(trans, err);
+ int ret2;
+
+ ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
+ parent_ino, &local_index, name);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
} else if (add_backref) {
- u64 local_index;
- int err;
+ int ret2;
- err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
- &local_index);
- if (err)
- btrfs_abort_transaction(trans, err);
+ ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
}
/* Return the original error code */
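
The mtime/ctime update that used to be open coded here (see the removed lines
above) is now factored into update_time_after_link_or_unlink(), whose definition
is not shown in this hunk. A hedged reconstruction of what that helper presumably
does, based purely on the removed comment and code, would look like this:

    static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
    {
    	/*
    	 * If we are replaying a log tree, do not update the mtime and ctime of
    	 * the directory with the current time: log replay is responsible for
    	 * setting them to the values they had when the fsync was done.
    	 */
    	if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
    		return;

    	inode_set_mtime_to_ts(&dir->vfs_inode,
    			      inode_set_ctime_current(&dir->vfs_inode));
    }
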
@@ -6553,30 +6828,33 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
};
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
- int err;
+ int ret;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (!err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!ret) {
+ if (S_ISDIR(inode->i_mode))
+ inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
d_instantiate_new(dentry, inode);
+ }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -6617,8 +6895,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
- int err;
- int drop_inode = 0;
+ int ret;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6627,12 +6904,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
- if (err)
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (ret)
goto fail;
- err = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (err)
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ if (ret)
goto fail;
/*
@@ -6643,67 +6920,66 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto fail;
}
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
- set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
- if (err) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+	/* Link added, now update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- err = btrfs_update_inode(trans, BTRFS_I(inode));
- if (err)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (err)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+	/* Grab a reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
- return err;
+ return ret;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6712,6 +6988,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 blocksize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6728,7 +7005,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_SIZE, max_size);
+ max_size = min_t(unsigned long, blocksize, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
max_size);
@@ -6740,14 +7017,15 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+ if (max_size < blocksize)
+ folio_zero_range(folio, max_size, blocksize - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
+ const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
@@ -6762,14 +7040,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
return uncompress_inline(path, folio, fi);
- copy_size = min_t(u64, PAGE_SIZE,
+ copy_size = min_t(u64, blocksize,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < blocksize)
+ folio_zero_range(folio, copy_size, blocksize - copy_size);
return 0;
}
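
The PAGE_SIZE to blocksize switch above matters when the fs block size is smaller
than the page size: an inline extent only describes data within the first fs
block, so the copy/zero split must stop at the block boundary rather than the page
boundary. A tiny standalone illustration with assumed numbers (not part of the
patch):

    #include <stdio.h>

    int main(void)
    {
    	/* Assumed: 4K fs block, a 1000-byte uncompressed inline extent. */
    	const unsigned long blocksize = 4096;
    	const unsigned long ram_bytes = 1000;
    	const unsigned long copy_size = ram_bytes < blocksize ? ram_bytes : blocksize;

    	/* Mirrors read_inline_extent(): copy what the inline item provides,
    	 * then zero the rest of the fs block. */
    	printf("copy %lu bytes, zero [%lu, %lu)\n",
    	       copy_size, copy_size, blocksize);
    	return 0;
    }
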
@@ -6808,18 +7086,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map_tree *em_tree = &inode->extent_tree;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto out;
@@ -6843,8 +7121,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
* point the commit_root has everything we need.
*/
if (btrfs_is_free_space_inode(inode)) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
}
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
@@ -6879,7 +7157,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
- if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
@@ -6956,7 +7234,7 @@ not_found:
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || extent_map_end(em) <= start) {
+ if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -6973,7 +7251,7 @@ out:
trace_btrfs_get_extent(root, inode, em);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
return em;
@@ -7001,8 +7279,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
- * @strict: if true, omit optimizations that might force us into unnecessary
- * cow. e.g., don't trust generation number.
*
* Return:
* >0 and update @len if we can do nocow write
@@ -7012,17 +7288,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
- bool nowait, bool strict)
+ bool nowait)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *leaf;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
int found_type;
@@ -7032,81 +7308,74 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return -ENOMEM;
path->nowait = nowait;
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), offset, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 1) {
if (path->slots[0] == 0) {
- /* can't find the item, must cow */
- ret = 0;
- goto out;
+ /* Can't find the item, must COW. */
+ return 0;
}
path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
- /* not our file or wrong item type, must cow */
- goto out;
+ /* Not our file or wrong item type, must COW. */
+ return 0;
}
if (key.offset > offset) {
- /* Wrong offset, must cow */
- goto out;
+ /* Wrong offset, must COW. */
+ return 0;
}
if (btrfs_file_extent_end(path) <= offset)
- goto out;
+ return 0;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
- nocow_args.strict = strict;
nocow_args.free_path = true;
- ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
/* can_nocow_file_extent() has freed the path. */
path = NULL;
if (ret != 1) {
/* Treat errors as not being able to NOCOW. */
- ret = 0;
- goto out;
+ return 0;
}
- ret = 0;
if (btrfs_extent_readonly(fs_info,
nocow_args.file_extent.disk_bytenr +
nocow_args.file_extent.offset))
- goto out;
+ return 0;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
- if (ret) {
- ret = -EAGAIN;
- goto out;
- }
+ ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
+ EXTENT_DELALLOC);
+ if (ret)
+ return -EAGAIN;
}
if (file_extent)
memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
*len = nocow_args.file_extent.num_bytes;
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 1;
}
/* The callers of this must take lock_extent() */
@@ -7154,7 +7423,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7167,15 +7436,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, file_extent->compression);
+ btrfs_extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
- /* em got 2 refs now, callers needs to do free_extent_map once. */
+ /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */
return em;
}
@@ -7189,13 +7458,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
static void wait_subpage_spinlock(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7208,14 +7477,14 @@ static void wait_subpage_spinlock(struct folio *folio)
* Here we just acquire the spinlock so that all existing callers
* should exit and we're safe to release/invalidate the page.
*/
- spin_lock_irq(&subpage->lock);
- spin_unlock_irq(&subpage->lock);
+ spin_lock_irq(&bfs->lock);
+ spin_unlock_irq(&bfs->lock);
}
static int btrfs_launder_folio(struct folio *folio)
{
return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
- PAGE_SIZE, NULL);
+ folio_size(folio), NULL);
}
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -7242,7 +7511,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
{
int ret = filemap_migrate_folio(mapping, dst, src, mode);
- if (ret != MIGRATEPAGE_SUCCESS)
+ if (ret)
return ret;
if (folio_test_ordered(src)) {
@@ -7250,7 +7519,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
folio_set_ordered(dst);
}
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#else
#define btrfs_migrate_folio NULL
@@ -7266,7 +7535,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
u64 page_start = folio_pos(folio);
u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
- int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
/*
* We have folio locked so no new ordered extent can be created on this
@@ -7302,7 +7571,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent(tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
@@ -7358,16 +7627,16 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, cur, range_end,
- EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
+ btrfs_clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -7403,12 +7672,11 @@ next:
* Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
- if (!inode_evicting) {
- clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
- extra_flags, &cached_state);
- }
+ if (!inode_evicting)
+ btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG | extra_flags,
+ &cached_state);
cur = range_end + 1;
}
/*
@@ -7430,19 +7698,22 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
.ino = btrfs_ino(inode),
.min_type = BTRFS_EXTENT_DATA_KEY,
.clear_extent_range = true,
+ .new_size = inode->vfs_inode.i_size,
};
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
int ret;
struct btrfs_trans_handle *trans;
- u64 mask = fs_info->sectorsize - 1;
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize);
+ const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
+
+ /* Our inode is locked and the i_size can't be changed concurrently. */
+ btrfs_assert_inode_locked(inode);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(inode,
- inode->vfs_inode.i_size & (~mask),
- (u64)-1);
+ ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1);
if (ret)
return ret;
}
@@ -7475,11 +7746,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- return -ENOMEM;
- rsv->size = min_size;
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = min_size;
+ rsv.failfast = true;
/*
* 1 for the truncate slack space
@@ -7492,7 +7761,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
}
/* Migrate the slack space for the truncate to our reserve */
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
/*
* We have reserved 2 metadata units when we started the transaction and
@@ -7504,30 +7773,25 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
goto out;
}
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
while (1) {
struct extent_state *cached_state = NULL;
- const u64 new_size = inode->vfs_inode.i_size;
- const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
- control.new_size = new_size;
- lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_map_range(inode,
- ALIGN(new_size, fs_info->sectorsize),
- (u64)-1, false);
+ btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -7547,9 +7811,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
+ btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
/*
* We have reserved 2 metadata units when we started the
* transaction and min_size matches 1 unit, so this should never
@@ -7558,7 +7822,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
}
/*
@@ -7571,7 +7835,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
+ inode->vfs_inode.i_size, (u64)-1);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -7596,7 +7861,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_btree_balance_dirty(fs_info);
}
out:
- btrfs_free_block_rsv(fs_info, rsv);
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
@@ -7652,6 +7917,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
@@ -7683,10 +7949,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree);
+ btrfs_extent_map_tree_init(&ei->extent_tree);
/* This io tree sets the valid inode. */
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
ei->file_extent_tree = NULL;
@@ -7786,7 +8052,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
@@ -7794,6 +8060,9 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
void __cold btrfs_destroy_cachep(void)
@@ -7851,7 +8120,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
stat->result_mask |= STATX_SUBVOL;
spin_lock(&BTRFS_I(inode)->lock);
@@ -7884,6 +8153,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -7994,7 +8264,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
- if (need_abort)
+ if (unlikely(need_abort))
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8007,6 +8277,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries), pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries. Otherwise that
+ * task could sync a log without any entry for the inodes we are
+ * renaming, and replaying that log, if a power failure happens
+ * after syncing it, would result in deleting the inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains updates
+ * for only one of the directories but not for the other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8017,43 +8312,57 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_name, 0, old_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_name, 0, new_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8064,30 +8373,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
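The logs_pinned flow above reduces to a simple invariant: both log references are taken before the old directory entries are removed, and they are always dropped at the error label regardless of which failure path was taken. A toy userspace sketch of that balance, where pin_log()/unpin_log() merely stand in for btrfs_pin_log_trans()/btrfs_end_log_trans() and the failure is simulated:

#include <stdbool.h>
#include <stdio.h>

static int pin_count;

static void pin_log(void)   { pin_count++; }
static void unpin_log(void) { pin_count--; }

static int do_rename(bool fail_midway)
{
	bool logs_pinned = false;
	int ret = 0;

	pin_log();		/* source root's log */
	pin_log();		/* destination root's log */
	logs_pinned = true;

	if (fail_midway) {	/* e.g. unlinking the old entry failed */
		ret = -5;
		goto out_fail;
	}
	/* ... add the new directory entries and log the new names ... */

out_fail:
	if (logs_pinned) {
		unpin_log();
		unpin_log();
	}
	return ret;
}

int main(void)
{
	do_rename(true);
	do_rename(false);
	printf("pin balance after both paths: %d\n", pin_count);	/* expect 0 */
	return 0;
}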
@@ -8137,6 +8439,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8271,22 +8574,52 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry), pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry. Otherwise that task could sync
+ * a log without any entry for the inode we are renaming, and
+ * replaying that log, if a power failure happens after syncing
+ * it, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains updates
+ * for only one of the directories but not for the other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
if (new_inode) {
@@ -8294,24 +8627,33 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
- if (!ret && new_inode->i_nlink == 0)
+ if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
&new_fname.disk_name, 0, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8319,13 +8661,13 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_create_new_inode(trans, &whiteout_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
} else {
@@ -8335,6 +8677,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8416,46 +8762,42 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root,
- struct writeback_control *wbc, bool snapshot,
- bool in_reclaim_context)
+static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
+ bool snapshot, bool in_reclaim_context)
{
- struct btrfs_inode *binode;
- struct inode *inode;
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
int ret = 0;
- bool full_flush = wbc->nr_to_write == LONG_MAX;
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- binode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ struct btrfs_inode *inode;
+ struct inode *tmp_inode;
+
+ inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
- list_move_tail(&binode->delalloc_inodes,
- &root->delalloc_inodes);
+ list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
if (in_reclaim_context &&
- test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
continue;
- inode = igrab(&binode->vfs_inode);
- if (!inode) {
+ tmp_inode = igrab(&inode->vfs_inode);
+ if (!tmp_inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
- set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
- &binode->runtime_flags);
- if (full_flush) {
- work = btrfs_alloc_delalloc_work(inode);
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
+ if (nr_to_write == NULL) {
+ work = btrfs_alloc_delalloc_work(tmp_inode);
if (!work) {
- iput(inode);
+ iput(tmp_inode);
ret = -ENOMEM;
goto out;
}
@@ -8463,9 +8805,11 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(BTRFS_I(inode));
- if (ret || wbc->nr_to_write <= 0)
+ ret = filemap_flush_nr(tmp_inode->i_mapping,
+ nr_to_write);
+ btrfs_add_delayed_iput(inode);
+
+ if (ret || *nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -8491,29 +8835,17 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
struct btrfs_fs_info *fs_info = root->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
-
- return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+ return start_delalloc_inodes(root, NULL, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = nr,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
struct btrfs_root *root;
LIST_HEAD(splice);
int ret;
@@ -8525,13 +8857,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
- /*
- * Reset nr_to_write here so we know that we're doing a full
- * flush.
- */
- if (nr == LONG_MAX)
- wbc.nr_to_write = LONG_MAX;
-
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -8540,9 +8865,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ ret = start_delalloc_inodes(root, nr_to_write, false,
+ in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0 || wbc.nr_to_write <= 0)
+ if (ret < 0 || nr <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
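The switch from a per-call writeback_control to a plain long pointer means one page budget is shared across every root and inode visited by a single flush request, and a NULL pointer means a full flush. A rough userspace model of that accounting, where flush_one() is a hypothetical stand-in for filemap_flush_nr() and the dirty page counts are invented:

#include <stdio.h>

/* Write at most *budget pages of this inode and shrink the shared budget. */
static void flush_one(long *budget, long dirty_pages)
{
	long written = dirty_pages < *budget ? dirty_pages : *budget;

	*budget -= written;
}

int main(void)
{
	long nr = 32;			/* pages this request may write */
	long dirty[] = { 10, 30, 5 };	/* per-inode dirty page counts */

	for (int i = 0; i < 3 && nr > 0; i++)
		flush_one(&nr, dirty[i]);

	printf("remaining budget: %ld\n", nr);	/* 0: third inode was skipped */
	return 0;
}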
@@ -8573,7 +8899,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
.dentry = dentry,
};
unsigned int trans_num_items;
- int err;
+ int ret;
int name_len;
int datasize;
unsigned long ptr;
@@ -8581,7 +8907,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ /*
+ * Symlink targets are stored as uncompressed inline extent data, which
+ * must not reach the block size.
+ */
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ name_len >= fs_info->sectorsize)
return -ENAMETOOLONG;
inode = new_inode(dir->i_sb);
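A minimal sketch of the tightened symlink length check, under made-up limits (a hypothetical 2048-byte inline cap and a 4096-byte block size); in the kernel the bounds come from BTRFS_MAX_INLINE_DATA_SIZE() and fs_info->sectorsize:

#include <stdio.h>
#include <string.h>

/* The target must fit in an inline extent and stay below the block size. */
static int symlink_len_ok(const char *target, size_t max_inline, size_t sectorsize)
{
	size_t len = strlen(target);

	return len <= max_inline && len < sectorsize;
}

int main(void)
{
	printf("%d\n", symlink_len_ok("short/target", 2048, 4096));	/* 1 */
	return 0;
}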
@@ -8595,38 +8926,37 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode_set_bytes(inode, name_len);
new_inode_args.inode = inode;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
/* 1 additional item for the inline extent */
trans_num_items++;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (ret)
goto out;
path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- btrfs_abort_transaction(trans, err);
+ if (unlikely(!path)) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
inode = NULL;
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
- err = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (err) {
- btrfs_abort_transaction(trans, err);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
inode = NULL;
@@ -8645,20 +8975,19 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
- err = 0;
+ ret = 0;
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static struct btrfs_trans_handle *insert_prealloc_file_extent(
@@ -8769,7 +9098,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
*/
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
- min_size, 0, *alloc_hint, &ins, 1, 0);
+ min_size, 0, *alloc_hint, &ins, true, false);
if (ret)
break;
@@ -8795,11 +9124,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 0);
+ ins.offset, false);
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1, false);
@@ -8817,7 +9146,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
@@ -8839,7 +9168,7 @@ next:
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans);
@@ -8875,6 +9204,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
+/*
+ * NOTE: in case you are adding a MAY_EXEC check for directories:
+ * we mark them with IOP_FASTPERM_MAY_EXEC, which allows path lookup to
+ * elide calls here.
+ */
static int btrfs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
@@ -8989,7 +9323,7 @@ static ssize_t btrfs_encoded_read_inline(
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *item;
u64 ram_bytes;
@@ -8999,21 +9333,19 @@ static ssize_t btrfs_encoded_read_inline(
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
/* The extent item disappeared? */
- ret = -EIO;
+ return -EIO;
}
- goto out;
+ return ret;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -9026,17 +9358,16 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, item));
if (ret < 0)
- goto out;
+ return ret;
encoded->compression = ret;
if (encoded->compression) {
size_t inline_size;
inline_size = btrfs_file_extent_inline_item_len(leaf,
path->slots[0]);
- if (inline_size > count) {
- ret = -ENOBUFS;
- goto out;
- }
+ if (inline_size > count)
+ return -ENOBUFS;
+
count = inline_size;
encoded->unencoded_len = ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - extent_start;
@@ -9048,13 +9379,12 @@ static ssize_t btrfs_encoded_read_inline(
}
tmp = kmalloc(count, GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!tmp)
+ return -ENOMEM;
+
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9062,15 +9392,14 @@ static ssize_t btrfs_encoded_read_inline(
if (ret != count)
ret = -EFAULT;
kfree(tmp);
-out:
- btrfs_free_path(path);
+
return ret;
}
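BTRFS_PATH_AUTO_FREE lets the function above return early on every error without funnelling through an out label that frees the path. A small userspace sketch of the same idea using the cleanup attribute (a GCC/clang extension; the buffer and helper names are invented and this is not the btrfs macro itself):

#include <stdio.h>
#include <stdlib.h>

static void free_bufp(char **p)
{
	free(*p);	/* free(NULL) is a no-op, so error paths stay safe */
}

static int work(int fail_early)
{
	char *buf __attribute__((cleanup(free_bufp))) = malloc(64);

	if (!buf)
		return -12;	/* -ENOMEM, nothing to free */
	if (fail_early)
		return -5;	/* buf is freed automatically on return */

	snprintf(buf, 64, "hello");
	printf("%s\n", buf);
	return 0;		/* freed here as well */
}

int main(void)
{
	work(1);
	work(0);
	return 0;
}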
struct btrfs_encoded_read_private {
- wait_queue_head_t wait;
+ struct completion *sync_reads;
void *uring_ctx;
- atomic_t pending;
+ refcount_t pending_refs;
blk_status_t status;
};
@@ -9080,23 +9409,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
if (bbio->bio.bi_status) {
/*
- * The memory barrier implied by the atomic_dec_return() here
- * pairs with the memory barrier implied by the
- * atomic_dec_return() or io_wait_event() in
- * btrfs_encoded_read_regular_fill_pages() to ensure that this
- * write is observed before the load of status in
+ * The memory barrier implied by the refcount_dec_and_test() here
+ * pairs with the memory barrier implied by the refcount_dec_and_test()
+ * in btrfs_encoded_read_regular_fill_pages() to ensure that
+ * this write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (atomic_dec_return(&priv->pending) == 0) {
+ if (refcount_dec_and_test(&priv->pending_refs)) {
int err = blk_status_to_errno(READ_ONCE(priv->status));
if (priv->uring_ctx) {
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
- wake_up(&priv->wait);
+ complete(priv->sync_reads);
}
}
bio_put(&bbio->bio);
@@ -9106,37 +9434,44 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 disk_bytenr, u64 disk_io_size,
struct page **pages, void *uring_ctx)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private *priv;
+ struct btrfs_encoded_read_private *priv, sync_priv;
+ struct completion sync_reads;
unsigned long i = 0;
struct btrfs_bio *bbio;
int ret;
- priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
- if (!priv)
- return -ENOMEM;
+ /*
+ * Fast path for synchronous reads, which complete within this call;
+ * io_uring reads need the private data to outlive it, so allocate it.
+ */
+ if (uring_ctx) {
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+ } else {
+ priv = &sync_priv;
+ init_completion(&sync_reads);
+ priv->sync_reads = &sync_reads;
+ }
- init_waitqueue_head(&priv->wait);
- atomic_set(&priv->pending, 1);
+ refcount_set(&priv->pending_refs, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bbio->inode = inode;
do {
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv->pending);
+ refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bbio->inode = inode;
continue;
}
@@ -9145,11 +9480,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv->pending);
+ refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
if (uring_ctx) {
- if (atomic_dec_return(&priv->pending) == 0) {
+ if (refcount_dec_and_test(&priv->pending_refs)) {
ret = blk_status_to_errno(READ_ONCE(priv->status));
btrfs_uring_read_extent_endio(uring_ctx, ret);
kfree(priv);
@@ -9158,12 +9493,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
return -EIOCBQUEUED;
} else {
- if (atomic_dec_return(&priv->pending) != 0)
- io_wait_event(priv->wait, !atomic_read(&priv->pending));
+ if (!refcount_dec_and_test(&priv->pending_refs))
+ wait_for_completion_io(&sync_reads);
/* See btrfs_encoded_read_endio() for ordering. */
- ret = blk_status_to_errno(READ_ONCE(priv->status));
- kfree(priv);
- return ret;
+ return blk_status_to_errno(READ_ONCE(priv->status));
}
}
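The accounting in btrfs_encoded_read_regular_fill_pages() starts the reference count at 1 for the submitter, takes one reference per submitted bio, and only the final drop signals completion. A toy single-threaded model of that bookkeeping (the done flag stands in for complete()/wait_for_completion_io(); there is no real I/O or concurrency here):

#include <stdbool.h>
#include <stdio.h>

struct read_private {
	int  pending_refs;
	bool done;
};

static void put_ref(struct read_private *priv)
{
	if (--priv->pending_refs == 0)
		priv->done = true;	/* last reference: signal completion */
}

int main(void)
{
	struct read_private priv = { .pending_refs = 1 };	/* submitter's ref */

	for (int bio = 0; bio < 3; bio++)
		priv.pending_refs++;	/* each submitted bio takes a reference */
	for (int bio = 0; bio < 3; bio++)
		put_ref(&priv);		/* their endio callbacks drop it */

	put_ref(&priv);			/* submitter drops its own reference */
	printf("all reads finished: %s\n", priv.done ? "yes" : "no");
	return 0;
}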
@@ -9196,7 +9529,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out;
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9273,7 +9606,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_inode;
}
- if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9282,7 +9615,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ordered) {
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9295,13 +9628,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, cached_state);
+ btrfs_lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
cond_resched();
}
}
@@ -9319,7 +9652,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* For inline extents we get everything we need out of the
* extent item.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
cached_state, extent_start,
@@ -9331,7 +9664,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* We only want to return up to EOF even if the extent extends beyond
* that.
*/
- encoded->len = min_t(u64, extent_map_end(em),
+ encoded->len = min_t(u64, btrfs_extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
@@ -9339,7 +9672,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (extent_map_is_compressed(em)) {
+ } else if (btrfs_extent_map_is_compressed(em)) {
*disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -9354,12 +9687,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- extent_map_compression(em));
+ btrfs_extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- *disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
@@ -9372,11 +9705,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = count;
*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
if (*disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
@@ -9388,11 +9721,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
}
out_em:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock_extent:
/* Leave inode and extent locked if we need to do a read. */
if (!unlocked && ret != -EIOCBQUEUED)
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9529,8 +9862,6 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
}
for (;;) {
- struct btrfs_ordered_extent *ordered;
-
ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
goto out_folios;
@@ -9539,14 +9870,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_folios;
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
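The for(;;) loop above keeps retrying until the range is quiescent while the extent lock is held: no ordered extent and no cached page may cover it, otherwise the lock is dropped and the wait starts over. A toy model of that retry shape, with invented helpers standing in for btrfs_lookup_ordered_range() and filemap_range_has_page():

#include <stdbool.h>
#include <stdio.h>

static int attempts;

static bool range_has_ordered_extent(void) { return attempts < 2; }
static bool range_has_pages(void)          { return false; }

int main(void)
{
	for (;;) {
		attempts++;
		/* wait for ordered extents, write back pages, lock the range */
		if (!range_has_ordered_extent() && !range_has_pages())
			break;	/* range is clean while we hold the lock */
		/* unlock the range, reschedule, and try again */
	}
	printf("range became quiescent after %d attempt(s)\n", attempts);
	return 0;
}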
@@ -9580,7 +9911,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
}
ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
- disk_num_bytes, 0, 0, &ins, 1, 1);
+ disk_num_bytes, 0, 0, &ins, true, true);
if (ret)
goto out_delalloc_release;
extent_reserved = true;
@@ -9596,11 +9927,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = PTR_ERR(em);
goto out_free_reserved;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED));
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -9611,7 +9942,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -9621,7 +9952,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
btrfs_delalloc_release_extents(inode, num_bytes);
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
@@ -9634,9 +9965,9 @@ out_free_data_space:
* bytes_may_use.
*/
if (!extent_reserved)
- btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+ btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
@@ -9789,15 +10120,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct extent_map *em = NULL;
struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
+ struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
+ struct btrfs_path *path = NULL;
int ret = 0;
u64 isize;
- u64 start;
+ u64 prev_extent_end = 0;
+
+ /*
+ * Acquire the inode's mmap lock to prevent races with memory mapped
+ * writes, as they could happen after we flush delalloc below and before
+ * we lock the extent range further below. The inode itself was already
+ * locked higher up in the call chain.
+ */
+ btrfs_assert_inode_locked(BTRFS_I(inode));
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
/*
* If the swap file was just created, make sure delalloc is done. If the
@@ -9806,22 +10147,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
*/
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- return ret;
+ goto out_unlock_mmap;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
+ }
+
+ path = btrfs_alloc_path();
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ if (!path || !backref_ctx) {
+ ret = -ENOMEM;
+ goto out_unlock_mmap;
}
/*
@@ -9836,7 +10187,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock_mmap;
}
/*
@@ -9850,7 +10202,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
@@ -9866,36 +10219,53 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
btrfs_root_id(root));
- return -EPERM;
+ ret = -EPERM;
+ goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent(io_tree, 0, isize - 1, &cached_state);
- start = 0;
- while (start < isize) {
- u64 logical_block_start, physical_block_start;
+ btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
+ while (prev_extent_end < isize) {
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
struct btrfs_block_group *bg;
- u64 len = isize - start;
+ u64 logical_block_start;
+ u64 physical_block_start;
+ u64 extent_gen;
+ u64 disk_bytenr;
+ u64 len;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = prev_extent_end;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
goto out;
- }
- if (em->disk_bytenr == EXTENT_MAP_HOLE) {
+ /*
+ * If the key is not found, it means we have an implicit hole (the
+ * NO_HOLES feature is enabled).
+ */
+ if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->disk_bytenr == EXTENT_MAP_INLINE) {
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -9907,23 +10277,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (extent_map_is_compressed(em)) {
+
+ if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
- logical_block_start = extent_map_block_start(em) + (start - em->start);
- len = min(len, em->len - (start - em->start));
- free_extent_map(em);
- em = NULL;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (disk_bytenr == 0) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+ prev_extent_end = btrfs_file_extent_end(path);
+
+ if (prev_extent_end > isize)
+ len = isize - key.offset;
+ else
+ len = btrfs_file_extent_num_bytes(leaf, ei);
- ret = can_nocow_extent(inode, start, &len, NULL, false, true);
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /*
+ * We don't need the path anymore, so release it to avoid deadlocks
+ * when calling btrfs_is_data_extent_shared(): joining a transaction
+ * there can block waiting for the current transaction's commit, which
+ * in turn may be trying to lock the same leaf to flush delayed items,
+ * for example.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
+ extent_gen, backref_ctx);
if (ret < 0) {
goto out;
- } else if (ret) {
- ret = 0;
- } else {
+ } else if (ret > 0) {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
@@ -9958,7 +10350,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
physical_block_start = (map->stripes[0].physical +
(logical_block_start - map->start));
- len = min(len, map->chunk_len - (logical_block_start - map->start));
btrfs_free_chunk_map(map);
map = NULL;
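For building swap extents the logical address is translated to a physical one through the chunk map: the physical start of stripe 0 plus the offset of the extent into the chunk. A sketch with a stand-in struct and made-up numbers (the real btrfs_chunk_map carries multiple stripes and more fields):

#include <stdio.h>
#include <stdint.h>

struct chunk_map_stub {
	uint64_t start;		/* logical start of the chunk */
	uint64_t physical;	/* physical start of stripe 0 */
};

int main(void)
{
	struct chunk_map_stub map = { .start = 1048576, .physical = 20971520 };
	uint64_t logical_block_start = 1310720;	/* hypothetical extent start */
	uint64_t physical;

	physical = map.physical + (logical_block_start - map.start);
	printf("physical_block_start = %llu\n", (unsigned long long)physical);
	return 0;
}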
@@ -9999,24 +10390,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
goto out;
}
- bsi.start = start;
+ bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
- start += len;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
- if (!IS_ERR_OR_NULL(em))
- free_extent_map(em);
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
- unlock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
@@ -10025,6 +10419,10 @@ out:
btrfs_exclop_finish(fs_info);
+out_unlock_mmap:
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
if (ret)
return ret;
@@ -10033,7 +10431,6 @@ out:
*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
sis->max = bsi.nr_pages;
sis->pages = bsi.nr_pages - 1;
- sis->highest_bit = bsi.nr_pages - 1;
return bsi.nr_extents;
}
#else
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c9302d193187..acb484546b1d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 {
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
- unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
+ unsigned int flags)
{
if (S_ISDIR(inode->i_mode))
return flags;
@@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
+static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
{
unsigned int iflags = 0;
- u32 flags = binode->flags;
- u32 ro_flags = binode->ro_flags;
+ u32 flags = inode->flags;
+ u32 ro_flags = inode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
{
- struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (binode->flags & BTRFS_INODE_SYNC)
+ if (inode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (binode->flags & BTRFS_INODE_IMMUTABLE)
+ if (inode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (binode->flags & BTRFS_INODE_APPEND)
+ if (inode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (binode->flags & BTRFS_INODE_NOATIME)
+ if (inode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (binode->flags & BTRFS_INODE_DIRSYNC)
+ if (inode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
- if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ if (inode->ro_flags & BTRFS_INODE_RO_VERITY)
new_fl |= S_VERITY;
- set_mask_bits(&inode->i_flags,
+ set_mask_bits(&inode->vfs_inode.i_flags,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
S_VERITY, new_fl);
}
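btrfs_sync_inode_flags_to_i_flags() recomputes the VFS flag bits from the btrfs inode flags and then replaces only the managed bits in a single step, which is what set_mask_bits() does atomically. A toy userspace version with invented flag values (no atomicity here, unlike the kernel helper):

#include <stdio.h>
#include <stdint.h>

#define B_SYNC      0x01	/* made-up btrfs-side flags */
#define B_IMMUTABLE 0x02
#define V_SYNC      0x10	/* made-up VFS-side flags */
#define V_IMMUTABLE 0x20

static void sync_flags(uint32_t btrfs_flags, uint32_t *i_flags)
{
	uint32_t new_fl = 0;

	if (btrfs_flags & B_SYNC)
		new_fl |= V_SYNC;
	if (btrfs_flags & B_IMMUTABLE)
		new_fl |= V_IMMUTABLE;

	/* Clear the managed mask, then set the recomputed bits. */
	*i_flags = (*i_flags & ~(V_SYNC | V_IMMUTABLE)) | new_fl;
}

int main(void)
{
	uint32_t i_flags = V_SYNC;	/* stale: SYNC set, IMMUTABLE clear */

	sync_flags(B_IMMUTABLE, &i_flags);
	printf("i_flags = 0x%x\n", i_flags);	/* 0x20 */
	return 0;
}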
@@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
-static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
unsigned int flags)
{
if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
@@ -246,26 +245,25 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
*/
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
- struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+ const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode));
return 0;
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags;
+ u32 inode_flags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
- fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(inode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (ret)
return ret;
- binode_flags = binode->flags;
+ inode_flags = inode->flags;
if (fsflags & FS_SYNC_FL)
- binode_flags |= BTRFS_INODE_SYNC;
+ inode_flags |= BTRFS_INODE_SYNC;
else
- binode_flags &= ~BTRFS_INODE_SYNC;
+ inode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
- binode_flags |= BTRFS_INODE_IMMUTABLE;
+ inode_flags |= BTRFS_INODE_IMMUTABLE;
else
- binode_flags &= ~BTRFS_INODE_IMMUTABLE;
+ inode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
- binode_flags |= BTRFS_INODE_APPEND;
+ inode_flags |= BTRFS_INODE_APPEND;
else
- binode_flags &= ~BTRFS_INODE_APPEND;
+ inode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
- binode_flags |= BTRFS_INODE_NODUMP;
+ inode_flags |= BTRFS_INODE_NODUMP;
else
- binode_flags &= ~BTRFS_INODE_NODUMP;
+ inode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
- binode_flags |= BTRFS_INODE_NOATIME;
+ inode_flags |= BTRFS_INODE_NOATIME;
else
- binode_flags &= ~BTRFS_INODE_NOATIME;
+ inode_flags &= ~BTRFS_INODE_NOATIME;
/* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
if (!fa->flags_valid) {
@@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
if (fsflags & FS_DIRSYNC_FL)
- binode_flags |= BTRFS_INODE_DIRSYNC;
+ inode_flags |= BTRFS_INODE_DIRSYNC;
else
- binode_flags &= ~BTRFS_INODE_DIRSYNC;
+ inode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
- if (inode->i_size == 0)
- binode_flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
} else {
- binode_flags |= BTRFS_INODE_NODATACOW;
+ inode_flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
* Revert back under same assumptions as above
*/
- if (S_ISREG(inode->i_mode)) {
- if (inode->i_size == 0)
- binode_flags &= ~(BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM);
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
} else {
- binode_flags &= ~BTRFS_INODE_NODATACOW;
+ inode_flags &= ~BTRFS_INODE_NODATACOW;
}
}
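The FS_NOCOW_FL handling above only changes the checksum flag together with NODATACOW for regular files that are still empty; non-empty regular files are left untouched and directories only toggle NODATACOW. A compact sketch of that decision, with invented flag values:

#include <stdbool.h>
#include <stdio.h>

#define NODATACOW 0x1	/* made-up values for illustration */
#define NODATASUM 0x2

static unsigned int apply_nocow(unsigned int flags, bool is_reg,
				long i_size, bool want_nocow)
{
	if (want_nocow) {
		if (is_reg) {
			/* Only safe to change csums while no extents exist. */
			if (i_size == 0)
				flags |= NODATACOW | NODATASUM;
		} else {
			flags |= NODATACOW;
		}
	} else {
		if (is_reg) {
			if (i_size == 0)
				flags &= ~(NODATACOW | NODATASUM);
		} else {
			flags &= ~NODATACOW;
		}
	}
	return flags;
}

int main(void)
{
	printf("empty file:     0x%x\n", apply_nocow(0, true, 0, true));	/* 0x3 */
	printf("non-empty file: 0x%x\n", apply_nocow(0, true, 4096, true));	/* 0x0 */
	return 0;
}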
@@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
- binode_flags &= ~BTRFS_INODE_COMPRESS;
- binode_flags |= BTRFS_INODE_NOCOMPRESS;
+ inode_flags &= ~BTRFS_INODE_COMPRESS;
+ inode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode))
+ if (IS_SWAPFILE(&inode->vfs_inode))
return -ETXTBSY;
- binode_flags |= BTRFS_INODE_COMPRESS;
- binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode_flags |= BTRFS_INODE_COMPRESS;
+ inode_flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
- binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
/*
@@ -376,114 +374,34 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
- NULL, 0, 0);
- if (ret && ret != -ENODATA) {
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
+ if (unlikely(ret && ret != -ENODATA)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
}
update_flags:
- binode->flags = binode_flags;
+ inode->flags = inode_flags;
+ btrfs_update_inode_mapping_flags(inode);
btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
out_end_trans:
btrfs_end_transaction(trans);
return ret;
}
-/*
- * Start exclusive operation @type, return true on success
- */
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type)
-{
- bool ret = false;
-
- spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
- fs_info->exclusive_operation = type;
- ret = true;
- }
- spin_unlock(&fs_info->super_lock);
-
- return ret;
-}
-
-/*
- * Conditionally allow to enter the exclusive operation in case it's compatible
- * with the running one. This must be paired with btrfs_exclop_start_unlock and
- * btrfs_exclop_finish.
- *
- * Compatibility:
- * - the same type is already running
- * - when trying to add a device and balance has been paused
- * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
- * must check the condition first that would allow none -> @type
- */
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type)
-{
- spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == type ||
- (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
- type == BTRFS_EXCLOP_DEV_ADD))
- return true;
-
- spin_unlock(&fs_info->super_lock);
- return false;
-}
-
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
-{
- spin_unlock(&fs_info->super_lock);
-}
-
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
-{
- spin_lock(&fs_info->super_lock);
- WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
- spin_unlock(&fs_info->super_lock);
- sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
-}
-
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op)
-{
- switch (op) {
- case BTRFS_EXCLOP_BALANCE_PAUSED:
- spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
- fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
- spin_unlock(&fs_info->super_lock);
- break;
- case BTRFS_EXCLOP_BALANCE:
- spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
- fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
- spin_unlock(&fs_info->super_lock);
- break;
- default:
- btrfs_warn(fs_info,
- "invalid exclop balance operation %d requested", op);
- }
-}
-
-static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
{
return put_user(inode->i_generation, arg);
}
@@ -551,22 +469,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return ret;
}
-int __pure btrfs_is_empty_uuid(const u8 *uuid)
-{
- int i;
-
- for (i = 0; i < BTRFS_UUID_SIZE; i++) {
- if (uuid[i])
- return 0;
- }
- return 1;
-}
-
/*
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
*/
-static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
{
/*
* 1 to add root block
@@ -596,7 +503,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_root_item *root_item;
+ struct btrfs_root_item AUTO_KFREE(root_item);
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -620,20 +527,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto out_root_item;
+ return ret;
/*
* Don't create a subvolume whose level is not zero, or qgroups will be
* screwed up since they assume a subvolume qgroup's level is 0.
*/
- if (btrfs_qgroup_level(objectid)) {
- ret = -ENOSPC;
- goto out_root_item;
- }
+ if (btrfs_qgroup_level(objectid))
+ return -ENOSPC;
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
- goto out_root_item;
+ return ret;
new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir);
if (!new_inode_args.inode) {
@@ -708,8 +613,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
- key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
@@ -726,7 +631,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
@@ -747,26 +652,26 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -785,8 +690,7 @@ out_inode:
out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
-out_root_item:
- kfree(root_item);
+
return ret;
}
@@ -934,7 +838,7 @@ free_pending:
static int btrfs_may_delete(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *victim, int isdir)
{
- int error;
+ int ret;
if (d_really_is_negative(victim))
return -ENOENT;
@@ -944,9 +848,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
- if (error)
- return error;
+ ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ return ret;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(idmap, dir, d_inode(victim)) ||
@@ -969,7 +873,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *child)
+ struct inode *dir, const struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
@@ -985,39 +889,32 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap,
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(const struct path *parent,
+static noinline int btrfs_mksubvol(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
- struct btrfs_root *snap_src,
+ struct qstr *qname, struct btrfs_root *snap_src,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = d_inode(parent->dentry);
+ struct inode *dir = d_inode(parent);
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
- struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
- int error;
-
- error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (error == -EINTR)
- return error;
+ struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
+ int ret;
- dentry = lookup_one(idmap, name, parent->dentry, namelen);
- error = PTR_ERR(dentry);
+ dentry = start_creating_killable(idmap, parent, qname);
if (IS_ERR(dentry))
- goto out_unlock;
+ return PTR_ERR(dentry);
- error = btrfs_may_create(idmap, dir, dentry);
- if (error)
+ ret = btrfs_may_create(idmap, dir, dentry);
+ if (ret)
goto out_dput;
/*
* Even if this name doesn't exist, we may get hash collisions.
* Check for them now, while we can still safely fail.
*/
- error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, &name_str);
- if (error)
+ ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str);
+ if (ret)
goto out_dput;
down_read(&fs_info->subvol_sem);
@@ -1026,24 +923,22 @@ static noinline int btrfs_mksubvol(const struct path *parent,
goto out_up_read;
if (snap_src)
- error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ ret = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(idmap, dir, dentry, inherit);
+ ret = create_subvol(idmap, dir, dentry, inherit);
- if (!error)
+ if (!ret)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&fs_info->subvol_sem);
out_dput:
- dput(dentry);
-out_unlock:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
- return error;
+ end_creating(dentry);
+ return ret;
}
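The start_creating_killable()/end_creating() pair replaces the open-coded i_rwsem lock, lookup_one() and dput(); the helpers are presumed to take the parent lock and drop it again in end_creating(), which is why the explicit btrfs_inode_unlock() on the directory disappears above. A minimal sketch under that assumption:

/* Sketch only; assumes the new VFS helpers lock the parent and unlock it in end_creating(). */
static int create_child_sketch(struct mnt_idmap *idmap, struct dentry *parent,
			       struct qstr *qname)
{
	struct dentry *dentry;
	int ret = 0;

	dentry = start_creating_killable(idmap, parent, qname);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* ... create the child here, with the parent held ... */

	end_creating(dentry);	/* dput() the child and release the parent, on success or error */
	return ret;
}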
-static noinline int btrfs_mksnapshot(const struct path *parent,
+static noinline int btrfs_mksnapshot(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
+ struct qstr *qname,
struct btrfs_root *root,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -1052,7 +947,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
/*
* Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
+ * possible. This is to avoid later writeback (running delalloc) to
* fallback to COW mode and unexpectedly fail with ENOSPC.
*/
btrfs_drew_read_lock(&root->snapshot_lock);
@@ -1070,8 +965,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- ret = btrfs_mksubvol(parent, idmap, name, namelen,
- root, readonly, inherit);
+ ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit);
+
atomic_dec(&root->snapshot_force_cow);
out:
btrfs_drew_read_unlock(&root->snapshot_lock);
@@ -1124,17 +1019,14 @@ static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
- char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1202,6 +1094,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (!strcmp(sizestr, "max"))
new_size = bdev_nr_bytes(device->bdev);
else {
+ char *retptr;
+
if (sizestr[0] == '-') {
mod = -1;
sizestr++;
@@ -1249,6 +1143,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = round_down(new_size, fs_info->sectorsize);
if (new_size > old_size) {
+ struct btrfs_trans_handle *trans;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -1261,7 +1157,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
} /* equal, nothing needs to be done */
if (ret == 0 && new_size != old_size)
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
btrfs_dev_name(device), device->devid,
old_size, new_size);
@@ -1276,12 +1172,12 @@ out_drop:
static noinline int __btrfs_ioctl_snap_create(struct file *file,
struct mnt_idmap *idmap,
- const char *name, unsigned long fd, int subvol,
+ const char *name, unsigned long fd, bool subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- int namelen;
int ret = 0;
+ struct qstr qname = QSTR_INIT(name, strlen(name));
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1290,21 +1186,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
if (ret)
goto out;
- namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
goto out_drop_write;
}
- if (name[0] == '.' &&
- (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ if (qname.name[0] == '.' &&
+ (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) {
ret = -EEXIST;
goto out_drop_write;
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, idmap, name,
- namelen, NULL, readonly, inherit);
+ ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL,
+ readonly, inherit);
} else {
CLASS(fd, src)(fd);
struct inode *src_inode;
@@ -1334,8 +1229,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
*/
ret = -EINVAL;
} else {
- ret = btrfs_mksnapshot(&file->f_path, idmap,
- name, namelen,
+ ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
@@ -1347,7 +1241,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -1372,7 +1266,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
@@ -1427,15 +1321,15 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
+static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u64 flags = 0;
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&fs_info->subvol_sem);
@@ -1538,8 +1432,8 @@ out:
return ret;
}
-static noinline int key_in_sk(struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk)
+static noinline bool key_in_sk(const struct btrfs_key *key,
+ const struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
@@ -1550,7 +1444,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret < 0)
- return 0;
+ return false;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
@@ -1558,13 +1452,13 @@ static noinline int key_in_sk(struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret > 0)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk,
+ const struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
@@ -1621,8 +1515,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
}
sh.objectid = key->objectid;
- sh.offset = key->offset;
sh.type = key->type;
+ sh.offset = key->offset;
sh.len = item_len;
sh.transid = found_transid;
@@ -1695,15 +1589,14 @@ out:
return ret;
}
-static noinline int search_ioctl(struct inode *inode,
+static noinline int search_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = inode_to_fs_info(inode);
- struct btrfs_root *root;
+ struct btrfs_fs_info *info = root->fs_info;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
@@ -1718,14 +1611,13 @@ static noinline int search_ioctl(struct inode *inode,
return -ENOMEM;
if (sk->tree_id == 0) {
- /* search the root of the inode that was passed */
- root = btrfs_grab_root(BTRFS_I(inode)->root);
+ /* Search the root that we got passed. */
+ root = btrfs_grab_root(root);
} else {
+ /* Look up the root from the arguments. */
root = btrfs_get_fs_root(info, sk->tree_id, true);
- if (IS_ERR(root)) {
- btrfs_free_path(path);
+ if (IS_ERR(root))
return PTR_ERR(root);
- }
}
key.objectid = sk->min_objectid;
@@ -1733,21 +1625,19 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = -EFAULT;
/*
* Ensure that the whole user buffer is faulted in at sub-page
* granularity, otherwise the loop may live-lock.
*/
- if (fault_in_subpage_writeable(ubuf + sk_offset,
- *buf_size - sk_offset))
+ if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) {
+ ret = -EFAULT;
break;
+ }
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- goto err;
- }
+ if (ret)
+ break;
+
ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
@@ -1755,16 +1645,16 @@ static noinline int search_ioctl(struct inode *inode,
break;
}
+ /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */
if (ret > 0)
ret = 0;
-err:
+
sk->nr_items = num_found;
btrfs_put_root(root);
- btrfs_free_path(path);
return ret;
}
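The explicit btrfs_free_path() calls disappear because BTRFS_PATH_AUTO_FREE() is assumed to declare the path with a scope-based cleanup (__free(btrfs_free_path)), so every return above releases it automatically. A minimal sketch of the pattern:

/* Sketch only; assumes BTRFS_PATH_AUTO_FREE() attaches __free(btrfs_free_path) to the pointer. */
static int auto_free_path_sketch(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* Freed automatically on this and any other return, no btrfs_free_path() needed. */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}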
-static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs = argp;
@@ -1780,7 +1670,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
buf_size = sizeof(uargs->buf);
- ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+ ret = search_ioctl(root, &sk, &buf_size, uargs->buf);
/*
* In the original implementation an overflow is handled by returning a
@@ -1794,7 +1684,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
@@ -1816,7 +1706,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
if (buf_size > buf_limit)
buf_size = buf_limit;
- ret = search_ioctl(inode, &args.key, &buf_size,
+ ret = search_ioctl(root, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
@@ -1843,7 +1733,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
int total_len = 0;
struct btrfs_inode_ref *iref;
struct extent_buffer *l;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
name[0]='\0';
@@ -1904,7 +1794,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
ret = 0;
out:
btrfs_put_root(root);
- btrfs_free_path(path);
return ret;
}
@@ -1921,10 +1810,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct btrfs_root *root = NULL;
- struct btrfs_path *path;
- struct btrfs_key key, key2;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_key key;
struct extent_buffer *leaf;
- struct inode *temp_inode;
char *ptr;
int slot;
int len;
@@ -1943,15 +1831,15 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
root = btrfs_get_fs_root(fs_info, treeid, true);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
- }
+ if (IS_ERR(root))
+ return PTR_ERR(root);
key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *temp_inode;
+
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out_put;
@@ -1976,24 +1864,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
read_extent_buffer(leaf, ptr,
(unsigned long)(iref + 1), len);
- /* Check the read+exec permission of this directory */
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_ITEM_KEY);
- if (ret < 0) {
- goto out_put;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out_put;
- }
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(leaf, &key2, slot);
- if (key2.objectid != dirid) {
- ret = -ENOENT;
- goto out_put;
- }
-
/*
* We don't need the path anymore, so release it and
* avoid deadlocks and lockdep warnings in case
@@ -2001,18 +1871,17 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* btree and lock the same leaf.
*/
btrfs_release_path(path);
- temp_inode = btrfs_iget(key2.objectid, root);
+ temp_inode = btrfs_iget(key.offset, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(idmap, temp_inode,
+ /* Check the read+exec permission of this directory. */
+ ret = inode_permission(idmap, &temp_inode->vfs_inode,
MAY_READ | MAY_EXEC);
- iput(temp_inode);
- if (ret) {
- ret = -EACCES;
+ iput(&temp_inode->vfs_inode);
+ if (ret)
goto out_put;
- }
if (key.offset == upper_limit)
break;
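Why dropping the btrfs_previous_item() walk is safe here: for an inode ref item the key already names the parent directory, so the permission check can open it directly.

/*
 * Key layout relied on above:
 *   (objectid = child inode number, type = BTRFS_INODE_REF_KEY,
 *    offset = parent directory inode number)
 * hence btrfs_iget(key.offset, root) opens the parent directory without the
 * extra backwards search for the parent's INODE_ITEM.
 */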
@@ -2038,12 +1907,10 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -2053,10 +1920,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
- if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
- ret = -EINVAL;
- goto out;
- }
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref))
+ return -EINVAL;
/* Copy subvolume's name */
item_off += sizeof(struct btrfs_root_ref);
@@ -2066,8 +1931,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
out_put:
btrfs_put_root(root);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2229,7 +2093,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2312,7 +2176,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2341,7 +2205,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_item(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2380,7 +2244,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
- int subvol_namelen;
int ret = 0;
bool destroy_parent = false;
@@ -2503,10 +2366,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- subvol_namelen = strlen(subvol_name);
-
if (strchr(subvol_name, '/') ||
- strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ strcmp(subvol_name, "..") == 0) {
ret = -EINVAL;
goto free_subvol_name;
}
@@ -2516,18 +2377,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto free_subvol_name;
}
- ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (ret == -EINTR)
- goto free_subvol_name;
- dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
+ dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name));
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
- goto out_unlock_dir;
- }
-
- if (d_really_is_negative(dentry)) {
- ret = -ENOENT;
- goto out_dput;
+ goto out_end_removing;
}
inode = d_inode(dentry);
@@ -2548,7 +2401,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
- goto out_dput;
+ goto out_end_removing;
/*
* Do not allow deletion if the parent dir is the same
@@ -2559,21 +2412,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EINVAL;
if (root == dest)
- goto out_dput;
+ goto out_end_removing;
ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
if (ret)
- goto out_dput;
+ goto out_end_removing;
}
/* check if subvolume may be deleted by a user */
ret = btrfs_may_delete(idmap, dir, dentry, 1);
if (ret)
- goto out_dput;
+ goto out_end_removing;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
- goto out_dput;
+ goto out_end_removing;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
@@ -2582,10 +2435,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (!ret)
d_delete_notify(dir, dentry);
-out_dput:
- dput(dentry);
-out_unlock_dir:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
+out_end_removing:
+ end_removing(dentry);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -2635,6 +2486,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
+ /*
+ * Don't allow defrag on pre-content watched files, as it could
+ * populate the page cache with 0's via readahead.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (argp) {
if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
@@ -2644,8 +2504,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EOPNOTSUPP;
goto out;
}
- /* compression requires us to start the IO */
- if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Compression or no-compression requires us to start the IO. */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
range.extent_thresh = (u32)-1;
}
@@ -2653,7 +2519,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
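The new NOCOMPRESS case mirrors the COMPRESS one: both force BTRFS_DEFRAG_RANGE_START_IO and an unlimited extent threshold, and combining the two flags is rejected. A hypothetical userspace caller, assuming the uapi header exports BTRFS_DEFRAG_RANGE_NOCOMPRESS next to the existing defrag range flags and that it requests uncompressed rewrites:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Illustrative only: defragment the whole file and (assumed) drop compression on rewrite. */
static int defrag_nocompress(int fd)
{
	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;				/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_NOCOMPRESS;	/* must not be combined with _COMPRESS */
	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
}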
@@ -2786,7 +2652,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
err_drop:
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2837,7 +2703,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
out_free:
@@ -2845,7 +2711,7 @@ out_free:
return ret;
}
-static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -2899,7 +2765,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
return ret;
}
-static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
@@ -2976,7 +2842,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(btrfs_root_id(new_root))) {
+ if (!btrfs_is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3007,7 +2873,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -3041,7 +2906,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_space_args space_args = { 0 };
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
- struct btrfs_ioctl_space_info *dest_orig;
+ struct btrfs_ioctl_space_info AUTO_KFREE(dest_orig);
struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
static const u64 types[] = {
@@ -3162,9 +3027,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(dest_orig);
out:
if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
ret = -EFAULT;
@@ -3226,7 +3090,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
return -EPERM;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
return -EINVAL;
}
@@ -3383,7 +3247,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
u64 rel_ptr;
int size;
struct btrfs_ioctl_ino_path_args *ipa = NULL;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_path *path;
if (!capable(CAP_DAC_READ_SEARCH))
@@ -3431,7 +3295,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
out:
btrfs_free_path(path);
- free_ipath(ipath);
kfree(ipa);
return ret;
@@ -3444,7 +3307,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
- struct btrfs_path *path = NULL;
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
@@ -3478,14 +3340,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
goto out_loi;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- inodes, ignore_offset);
- btrfs_free_path(path);
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -3704,7 +3559,7 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_ioctl_balance_args AUTO_KFREE(bargs);
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -3726,8 +3581,6 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
-
- kfree(bargs);
out:
mutex_unlock(&fs_info->balance_mutex);
return ret;
@@ -3802,22 +3655,6 @@ drop_write:
return ret;
}
-/*
- * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
- * done before any operations.
- */
-static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
-{
- bool ret = true;
-
- mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!fs_info->quota_root)
- ret = false;
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
-
- return ret;
-}
-
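The local helper is dropped in favour of btrfs_qgroup_enabled(); the replacement is assumed to be the existing lockless flag test from qgroup.h, which is enough for this quick pre-check since, as the removed comment notes, proper locking still happens before any real operation:

/* Assumed definition (from qgroup.h); shown here only for context. */
static inline bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
	return fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
}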
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -3832,7 +3669,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3849,7 +3686,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
if (!prealloc) {
ret = -ENOMEM;
- goto drop_write;
+ goto out;
}
}
@@ -3902,7 +3739,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3920,7 +3757,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- if (sa->create && is_fstree(sa->qgroupid)) {
+ if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
ret = -EINVAL;
goto out;
}
@@ -3961,7 +3798,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -4009,7 +3846,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -4117,7 +3954,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4141,7 +3978,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret < 0 && ret != -EEXIST) {
+ if (unlikely(ret < 0 && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4287,7 +4124,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
}
spin_lock(&fs_info->super_lock);
- strcpy(super_block->label, label);
+ strscpy(super_block->label, label);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
@@ -4331,13 +4168,13 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
return 0;
}
-static int check_feature_bits(struct btrfs_fs_info *fs_info,
+static int check_feature_bits(const struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
{
const char *type = btrfs_feature_set_name(set);
- char *names;
+ const char AUTO_KFREE(names);
u64 disallowed, unsupported;
u64 set_mask = flags & change_mask;
u64 clear_mask = ~flags & change_mask;
@@ -4345,12 +4182,11 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info,
unsupported = set_mask & ~supported_flags;
if (unsupported) {
names = btrfs_printable_features(set, unsupported);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"this kernel does not support the %s feature bit%s",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"this kernel does not support %s bits 0x%llx",
type, unsupported);
@@ -4360,12 +4196,11 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info,
disallowed = set_mask & ~safe_set;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"can't set the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"can't set %s bits 0x%llx while mounted",
type, disallowed);
@@ -4375,12 +4210,11 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info,
disallowed = clear_mask & ~safe_clear;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"can't clear the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"can't clear %s bits 0x%llx while mounted",
type, disallowed);
@@ -4467,7 +4301,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4498,7 +4332,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(inode, arg);
+ ret = btrfs_ioctl_send(root, arg);
kfree(arg);
return ret;
}
@@ -4594,7 +4428,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
args.compression, &unlocked);
if (!unlocked) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
}
@@ -4716,6 +4550,13 @@ out_acct:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4737,20 +4578,25 @@ struct btrfs_uring_priv {
};
struct io_btrfs_cmd {
+ struct btrfs_uring_encoded_data *data;
struct btrfs_uring_priv *priv;
};
-static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
struct btrfs_uring_priv *priv = bc->priv;
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
- unsigned long index;
+ pgoff_t index;
u64 cur;
size_t page_offset;
ssize_t ret;
+ /* The inode lock has already been acquired in btrfs_uring_read_extent(). */
+ btrfs_lockdep_inode_acquire(inode, i_rwsem);
+
if (priv->err) {
ret = priv->err;
goto out;
@@ -4780,10 +4626,10 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
ret = priv->count;
out:
- unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
add_rchar(current, ret);
for (index = 0; index < priv->nr_pages; index++)
@@ -4792,6 +4638,7 @@ out:
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
+ kfree(bc->data);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4859,10 +4706,17 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
* and inode and freeing the allocations.
*/
+ /*
+ * We're returning to userspace with the inode lock held, and that's
+ * okay - it'll get unlocked in a worker thread. Call
+ * btrfs_lockdep_inode_release() to avoid confusing lockdep.
+ */
+ btrfs_lockdep_inode_release(inode, i_rwsem);
+
return -EIOCBQUEUED;
out_fail:
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
kfree(priv);
return ret;
@@ -4870,73 +4724,92 @@ out_fail:
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
- struct btrfs_ioctl_encoded_io_args args = { 0 };
int ret;
u64 disk_bytenr, disk_io_size;
- struct file *file;
- struct btrfs_inode *inode;
- struct btrfs_fs_info *fs_info;
- struct extent_io_tree *io_tree;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out_acct;
}
- file = cmd->file;
- inode = BTRFS_I(file->f_inode);
- fs_info = inode->root->fs_info;
- io_tree = &inode->io_tree;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
- struct btrfs_ioctl_encoded_io_args_32 args32;
-
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
- if (copy_from_user(&args32, sqe_addr, copy_end)) {
- ret = -EFAULT;
- goto out_acct;
- }
- args.iov = compat_ptr(args32.iov);
- args.iovcnt = args32.iovcnt;
- args.offset = args32.offset;
- args.flags = args32.flags;
#else
- return -ENOTTY;
+ ret = -ENOTTY;
+ goto out_acct;
#endif
} else {
copy_end = copy_end_kernel;
- if (copy_from_user(&args, sqe_addr, copy_end)) {
- ret = -EFAULT;
+ }
+
+ if (!data) {
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data) {
+ ret = -ENOMEM;
goto out_acct;
}
- }
- if (args.flags != 0)
- return -EINVAL;
+ bc->data = data;
- ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
- &iov, &iter);
- if (ret < 0)
- goto out_acct;
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
- if (iov_iter_count(&iter) == 0) {
- ret = 0;
- goto out_free;
+ if (copy_from_user(&args32, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+
+ data->args.iov = compat_ptr(args32.iov);
+ data->args.iovcnt = args32.iovcnt;
+ data->args.offset = args32.offset;
+ data->args.flags = args32.flags;
+#endif
+ } else {
+ if (copy_from_user(&data->args, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (data->args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ data->iov = data->iovstack;
+ ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt,
+ ARRAY_SIZE(data->iovstack), &data->iov,
+ &data->iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&data->iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
}
- pos = args.offset;
- ret = rw_verify_area(READ, file, &pos, args.len);
+ pos = data->args.offset;
+ ret = rw_verify_area(READ, file, &pos, data->args.len);
if (ret < 0)
goto out_free;
@@ -4949,17 +4822,20 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
start = ALIGN_DOWN(pos, fs_info->sectorsize);
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
file_accessed(file);
- if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
- sizeof(args) - copy_end_kernel)) {
+ if (copy_to_user(sqe_addr + copy_end,
+ (const char *)&data->args + copy_end_kernel,
+ sizeof(data->args) - copy_end_kernel)) {
if (ret == -EIOCBQUEUED) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
ret = -EFAULT;
@@ -4967,57 +4843,178 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
}
if (ret == -EIOCBQUEUED) {
- u64 count;
+ u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size);
- /*
- * If we've optimized things by storing the iovecs on the stack,
- * undo this.
- */
- if (!iov) {
- iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
- if (!iov) {
- unlock_extent(io_tree, start, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- ret = -ENOMEM;
+ /* Match ioctl by not returning past EOF if uncompressed. */
+ if (!data->args.compression)
+ count = min_t(u64, count, data->args.len);
+
+ ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend,
+ cached_state, disk_bytenr, disk_io_size,
+ count, data->args.compression,
+ data->iov, cmd);
+
+ goto out_acct;
+ }
+
+out_free:
+ kfree(data->iov);
+
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+
+ if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+ kfree(data);
+
+ return ret;
+}
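The heap-allocated btrfs_uring_encoded_data is what lets the command be re-issued after a non-blocking -EAGAIN without re-parsing the SQE. A minimal sketch of that lifecycle under the patch's assumptions (do_encoded_io() is a hypothetical stand-in for the read/write body):

/* Illustrative lifecycle sketch; do_encoded_io() is hypothetical. */
static int uring_cmd_state_sketch(struct io_uring_cmd *cmd)
{
	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
	struct btrfs_uring_encoded_data *data = NULL;
	int ret;

	if (cmd->flags & IORING_URING_CMD_REISSUE)
		data = bc->data;		/* state parsed on the first issue */
	if (!data) {
		data = kzalloc(sizeof(*data), GFP_NOFS);
		if (!data)
			return -ENOMEM;
		bc->data = data;		/* copy_from_user()/import_iovec() happen once here */
	}
	ret = do_encoded_io(cmd, data);
	if (ret != -EAGAIN && ret != -EIOCBQUEUED)
		kfree(data);			/* otherwise freed on completion or the next issue */
	return ret;
}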
+
+static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct file *file = cmd->file;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+ void __user *sqe_addr;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+ sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (!data) {
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data) {
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ bc->data = data;
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, sqe_addr, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ data->args.iov = compat_ptr(args32.iov);
+ data->args.iovcnt = args32.iovcnt;
+ data->args.offset = args32.offset;
+ data->args.flags = args32.flags;
+ data->args.len = args32.len;
+ data->args.unencoded_len = args32.unencoded_len;
+ data->args.unencoded_offset = args32.unencoded_offset;
+ data->args.compression = args32.compression;
+ data->args.encryption = args32.encryption;
+ memcpy(data->args.reserved, args32.reserved,
+ sizeof(data->args.reserved));
+#else
+ ret = -ENOTTY;
+ goto out_acct;
+#endif
+ } else {
+ if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) {
+ ret = -EFAULT;
goto out_acct;
}
-
- memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
}
- count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+ ret = -EINVAL;
+ if (data->args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved)))
+ goto out_acct;
+ if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (data->args.unencoded_offset > data->args.unencoded_len)
+ goto out_acct;
+ if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset)
+ goto out_acct;
- /* Match ioctl by not returning past EOF if uncompressed. */
- if (!args.compression)
- count = min_t(u64, count, args.len);
+ data->iov = data->iovstack;
+ ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt,
+ ARRAY_SIZE(data->iovstack), &data->iov,
+ &data->iter);
+ if (ret < 0)
+ goto out_acct;
- ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
- cached_state, disk_bytenr,
- disk_io_size, count,
- args.compression, iov, cmd);
+ if (iov_iter_count(&data->iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ }
+ if (issue_flags & IO_URING_F_NONBLOCK) {
+ ret = -EAGAIN;
goto out_acct;
}
-out_free:
- kfree(iov);
+ pos = data->args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, data->args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
+ if (ret)
+ goto out_iov;
+ kiocb.ki_pos = pos;
+
+ file_start_write(file);
+
+ ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args);
+ if (ret > 0)
+ fsnotify_modify(file);
+ file_end_write(file);
+out_iov:
+ kfree(data->iov);
out_acct:
if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
+ add_wchar(current, ret);
+ inc_syscw(current);
+ if (ret != -EAGAIN)
+ kfree(data);
return ret;
}
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))))
+ return -EIO;
+
switch (cmd->cmd_op) {
case BTRFS_IOC_ENCODED_READ:
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_ENCODED_READ_32:
#endif
return btrfs_uring_encoded_read(cmd, issue_flags);
+
+ case BTRFS_IOC_ENCODED_WRITE:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_WRITE_32:
+#endif
+ return btrfs_uring_encoded_write(cmd, issue_flags);
}
return -EINVAL;
@@ -5149,6 +5146,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
return 0;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg)
+{
+ int ret = 0;
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (u32 __user *)arg))
+ return -EFAULT;
+
+ if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST)
+ return -EINVAL;
+
+ if (btrfs_is_shutdown(fs_info))
+ return 0;
+
+ switch (flags) {
+ case BTRFS_SHUTDOWN_FLAGS_LOGFLUSH:
+ case BTRFS_SHUTDOWN_FLAGS_DEFAULT:
+ ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+ if (ret)
+ return ret;
+ btrfs_force_shutdown(fs_info);
+ ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+ if (ret)
+ return ret;
+ break;
+ case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH:
+ btrfs_force_shutdown(fs_info);
+ break;
+ }
+ return ret;
+}
+#endif
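From userspace the experimental ioctl takes a pointer to a u32 flags word, per the get_user() above. A hypothetical caller, assuming BTRFS_IOC_SHUTDOWN and the BTRFS_SHUTDOWN_FLAGS_* values are exported by the experimental uapi header:

#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Hypothetical userspace caller; assumes the experimental uapi exports these names. */
static int btrfs_force_shutdown_fd(int fd, int skip_log_flush)
{
	__u32 flags = skip_log_flush ? BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH
				     : BTRFS_SHUTDOWN_FLAGS_DEFAULT;

	/* DEFAULT and LOGFLUSH freeze/thaw the super first; NOLOGFLUSH shuts down immediately. */
	return ioctl(fd, BTRFS_IOC_SHUTDOWN, &flags);
}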
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -5167,19 +5201,19 @@ long btrfs_ioctl(struct file *file, unsigned int
case FITRIM:
return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0);
+ return btrfs_ioctl_snap_create(file, argp, false);
case BTRFS_IOC_SNAP_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ return btrfs_ioctl_snap_create_v2(file, argp, false);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1);
+ return btrfs_ioctl_snap_create(file, argp, true);
case BTRFS_IOC_SUBVOL_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ return btrfs_ioctl_snap_create_v2(file, argp, true);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp, false);
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(inode, argp);
+ return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5201,9 +5235,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(inode, argp);
+ return btrfs_ioctl_tree_search(root, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(inode, argp);
+ return btrfs_ioctl_tree_search_v2(root, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
@@ -5251,10 +5285,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
+ return _btrfs_ioctl_send(root, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
+ return _btrfs_ioctl_send(root, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5290,6 +5324,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
+ case FS_IOC_READ_VERITY_METADATA:
+ return fsverity_ioctl_read_metadata(file, argp);
case BTRFS_IOC_ENCODED_READ:
return btrfs_ioctl_encoded_read(file, argp, false);
case BTRFS_IOC_ENCODED_WRITE:
@@ -5302,6 +5338,10 @@ long btrfs_ioctl(struct file *file, unsigned int
#endif
case BTRFS_IOC_SUBVOL_SYNC_WAIT:
return btrfs_ioctl_subvol_sync(fs_info, argp);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_IOC_SHUTDOWN:
+ return btrfs_ioctl_shutdown(fs_info, arg);
+#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2b760c8778f8..ccf6bed9cc24 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -8,18 +8,19 @@
struct file;
struct dentry;
struct mnt_idmap;
-struct fileattr;
+struct file_kattr;
+struct io_uring_cmd;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_ioctl_balance_args;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(const u8 *uuid);
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9a7a7b723305..0035851d72b0 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
-#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
@@ -150,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti
/*
* Try-lock for read.
*
- * Return 1 if the rwlock has been taken, 0 otherwise
+ * Return true if the rwlock has been taken, false otherwise
*/
-int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -362,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
atomic_inc(&lock->readers);
/*
- * Ensure the pending reader count is perceieved BEFORE this reader
+ * Ensure the pending reader count is perceived BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 46c8be2afab1..a4673e7d95d7 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -74,7 +74,7 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_NEW_ROOT,
/*
- * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
* add this in here and add a static_assert to keep us from going over
* the limit. As of this writing we're limited to 8, and we're
* definitely using 8, hence this check to keep us from messing up in
@@ -129,6 +129,16 @@ enum btrfs_lockdep_trans_states {
rwsem_release(&owner->lock##_map, _THIS_IP_)
/*
+ * Used to account for the fact that when doing io_uring encoded I/O, we can
+ * return to userspace with the inode lock still held.
+ */
+#define btrfs_lockdep_inode_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_)
+
+#define btrfs_lockdep_inode_release(owner, lock) \
+ rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_)
+
+/*
* Macros for the transaction states wait events, similar to the generic wait
* event macros.
*/
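How the two annotations pair up across the encoded read path (both call sites appear in the ioctl.c hunks above):

/*
 * btrfs_uring_read_extent():    btrfs_lockdep_inode_release(inode, i_rwsem);
 *                               return -EIOCBQUEUED;    i_rwsem is still held
 * btrfs_uring_read_finished():  btrfs_lockdep_inode_acquire(inode, i_rwsem);
 *                               ...
 *                               btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */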
@@ -179,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
}
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
@@ -189,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
{
lockdep_assert_held_write(&eb->lock);
}
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_read(&eb->lock);
+}
#else
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a45bc11f8665..4758f66da449 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -58,9 +58,6 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
-#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -68,7 +65,14 @@ struct workspace {
struct list_head list;
};
-static struct workspace_manager wsm;
+static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
+static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
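As a worked example of the new sizing, assuming the usual bound from <linux/lzo.h>, lzo1x_worst_compress(x) = x + x/16 + 64 + 3:

/* Illustrative only: worst-case LZO buffer size for a 4K sector under that bound. */
static inline u32 lzo_worst_case_4k_example(void)
{
	return 4096 + 4096 / 16 + 64 + 3;	/* 4419 bytes, scaling with sectorsize not PAGE_SIZE */
}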
void lzo_free_workspace(struct list_head *ws)
{
@@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(void)
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info)
{
struct workspace *workspace;
@@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
- workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
+ workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
+ workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf)
*
* Will allocate new pages when needed.
*/
-static int copy_compressed_data_to_page(char *compressed_data,
+static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info,
+ char *compressed_data,
size_t compressed_size,
struct folio **out_folios,
unsigned long max_nr_folio,
- u32 *cur_out,
- const u32 sectorsize)
+ u32 *cur_out)
{
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 sector_bytes_left;
u32 orig_out;
struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, 0);
- write_compress_length(kaddr + offset_in_page(*cur_out),
- compressed_size);
+ kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out));
+ write_compress_length(kaddr, compressed_size);
*cur_out += LZO_LEN;
orig_out = *cur_out;
@@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
kaddr = kmap_local_folio(cur_folio, 0);
- memcpy(kaddr + offset_in_page(*cur_out),
+ memcpy(kaddr + offset_in_folio(cur_folio, *cur_out),
compressed_data + *cur_out - orig_out, copy_len);
*cur_out += copy_len;
@@ -209,12 +214,15 @@ out:
return 0;
}
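Folio indexing now uses min_folio_shift = PAGE_SHIFT + fs_info->block_min_order instead of PAGE_SHIFT. A worked example assuming 4K pages and a 16K minimum folio (block_min_order = 2):

/* Illustrative only: which output folio a byte offset lands in. */
static inline u32 compr_folio_index_example(u32 cur_out)
{
	const u32 min_folio_shift = 12 + 2;	/* PAGE_SHIFT + block_min_order */

	/* e.g. cur_out = 40960: index 40960 >> 14 = 2, in-folio offset 40960 - 2 * 16384 = 8192 */
	return cur_out >> min_folio_shift;
}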
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
struct folio *folio_in = NULL;
char *sizes_ptr;
const unsigned long max_nr_folio = *out_folios;
@@ -252,9 +260,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_folio(folio_in, 0);
- ret = lzo1x_1_compress(data_in +
- offset_in_page(cur_in), in_len,
+ data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in));
+ ret = lzo1x_1_compress(data_in, in_len,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
@@ -264,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len,
folios, max_nr_folio,
- &cur_out, sectorsize);
+ &cur_out);
if (ret < 0)
goto out;
@@ -281,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- /* Check if we have reached page boundary */
- if (PAGE_ALIGNED(cur_in)) {
+ /* Check if we have reached a folio boundary. */
+ if (IS_ALIGNED(cur_in, min_folio_size)) {
folio_put(folio_in);
folio_in = NULL;
}
@@ -299,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
out:
if (folio_in)
folio_put(folio_in);
- *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ *out_folios = DIV_ROUND_UP(cur_out, min_folio_size);
return ret;
}
@@ -311,15 +318,16 @@ out:
static void copy_compressed_segment(struct compressed_bio *cb,
char *dest, u32 len, u32 *cur_in)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct folio *cur_folio;
- u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
- orig_in + len - *cur_in);
+ struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift];
+ u32 copy_len = min_t(u32, orig_in + len - *cur_in,
+ folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
ASSERT(copy_len);
- cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
offset_in_folio(cur_folio, *cur_in), copy_len);
@@ -333,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
@@ -379,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[cur_in >> min_folio_shift];
ASSERT(cur_folio);
kaddr = kmap_local_folio(cur_folio, 0);
- seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ if (unlikely(seg_len > workspace_cbuf_length(fs_info))) {
struct btrfs_inode *inode = cb->bbio.inode;
/*
@@ -446,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
- size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ size_t max_segment_len = workspace_buf_length(fs_info);
int ret = 0;
- if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2))
return -EUCLEAN;
in_len = read_compress_length(data_in);
- if (in_len != srclen)
+ if (unlikely(in_len != srclen))
return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
- if (in_len != srclen - LZO_LEN * 2) {
+ if (unlikely(in_len != srclen - LZO_LEN * 2)) {
ret = -EUCLEAN;
goto out;
}
@@ -488,8 +497,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_lzo_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_lzo_compress = {
.max_level = 1,
.default_level = 1,
};
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 363fd28c0268..2f853de44473 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -18,11 +18,13 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
[BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+ [BTRFS_FS_STATE_EMERGENCY_SHUTDOWN] = 'E',
};
static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 08a9272399d2..d8c0bd17dcda 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -36,106 +36,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
btrfs_no_printk(fs_info, fmt, ##args)
#endif
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
/*
- * Wrappers that use printk_in_rcu
+ * Print a message with filesystem info, enclosed in RCU protection.
*/
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+#define btrfs_crit(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+#define btrfs_err(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+#define btrfs_warn(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
* Wrappers that use a ratelimited printk
*/
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+/* When printk() is no_printk(), expand to no-op. */
+#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
+#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
#endif
#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
@@ -145,40 +85,98 @@ do { \
rcu_read_unlock(); \
} while (0)
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
+ \
+ rcu_read_lock(); \
if (__ratelimit(&_rs)) \
btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
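
The consolidated ratelimited wrapper above takes the per-callsite rate-limit state and the RCU read lock in one place, so every remaining btrfs_*_rl() user gets both. A short, hedged usage sketch (the message, fs_info, device and logical are illustrative context; btrfs_dev_name() is shown because device names are RCU-protected strings, which is why the RCU protection matters here):

	/* Illustrative only: rate-limited and RCU-protected in one call. */
	btrfs_warn_rl(fs_info, "checksum error on device %s, logical %llu",
		      btrfs_dev_name(device), logical);
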
#ifdef CONFIG_BTRFS_ASSERT
-#define btrfs_assertfail(expr, file, line) ({ \
- pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \
- BUG(); \
-})
+__printf(1, 2)
+static inline void verify_assert_printk_format(const char *fmt, ...) {
+ /* Stub to verify the assertion format string. */
+}
+
+/* Take the first token if any. */
+#define __FIRST_ARG(_, ...) _
+/*
+ * Skip the first token and return the rest; if it's empty, the comma is dropped.
+ * As ##__VA_ARGS__ cannot be at the beginning of the macro, __VA_OPT__ is needed
+ * (supported since GCC 8 and Clang 12).
+ */
+#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__
+
+#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000
+/*
+ * Assertion with optional printk() format.
+ *
+ * Accepted syntax:
+ * ASSERT(condition);
+ * ASSERT(condition, "string");
+ * ASSERT(condition, "variable=%d", variable);
+ *
+ * How it works:
+ * - if there's no format string, ""[0] evaluates at compile time to 0 and the
+ * true branch is executed
+ * - any non-empty format string with the "" prefix evaluates to != 0 at
+ * compile time and the false branch is executed
+ * - stringified condition is printed as %s so we don't accidentally mix format
+ * strings (the % operator)
+ * - there can be only one printk() call, so the format strings and arguments are
+ * spliced together:
+ * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS]
+ * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor
+ * (requires __VA_OPT__ support)
+ * - otherwise we could use __VA_OPT__(,) __VA_ARGS__ for the 2nd and later arguments
+ */
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ if (("" __FIRST_ARG(args) [0]) == 0) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ } else { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \
+ #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \
+ } \
+ BUG(); \
+ } \
+} while(0)
+
+#else
+
+/* For GCC < 8.x, only the simple output without the format string is available. */
+
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ BUG(); \
+ } \
+} while(0)
+
+#endif
+
+#else
+/* Compile check the @cond expression but don't generate any code. */
+#define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond)
+#endif
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
+#ifdef CONFIG_BTRFS_DEBUG
+/* Verbose warning only under debug build. */
+#define DEBUG_WARN(args...) WARN(1, KERN_ERR args)
#else
-#define ASSERT(expr) (void)(expr)
+#define DEBUG_WARN(...) do {} while(0)
#endif
__printf(5, 6)
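
The new ASSERT() relies on two preprocessor properties spelled out in the comment above: string-literal concatenation makes "" __FIRST_ARG(args) collapse to "" (whose first byte is 0) at compile time when no format string is given, and __VA_OPT__(,) drops the stray comma when the variadic part is empty. A standalone, hedged sketch of the same scheme outside the kernel (MY_ASSERT, FIRST_ARG and REST_ARGS are illustrative names, not the btrfs macros; it needs GCC 8+ or Clang 12+ for __VA_OPT__):

#include <stdio.h>
#include <stdlib.h>

#define FIRST_ARG(_, ...) _
#define REST_ARGS(_, ...) __VA_OPT__(,) __VA_ARGS__

/* Assertion with an optional printf()-style message, same scheme as above. */
#define MY_ASSERT(cond, args...)						\
do {										\
	if (!(cond)) {								\
		if (("" FIRST_ARG(args) [0]) == 0)				\
			fprintf(stderr, "assertion failed: %s\n", #cond);	\
		else								\
			fprintf(stderr, "assertion failed: %s (" FIRST_ARG(args) ")\n", \
				#cond REST_ARGS(args));				\
		abort();							\
	}									\
} while (0)

int main(void)
{
	int ret = -5;

	MY_ASSERT(1 + 1 == 2);			/* passes, prints nothing */
	MY_ASSERT(ret == 0, "ret=%d", ret);	/* prints "... (ret=-5)" and aborts */
	return 0;
}
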
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 0d599fd847c9..12c5a9d6564f 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -7,8 +7,18 @@
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
+#include <linux/bio.h>
+
+/*
+ * Convenience macros to define a pointer with the __free(kfree) and
+ * __free(kvfree) cleanup attributes and initialized to NULL.
+ */
+#define AUTO_KFREE(name) *name __free(kfree) = NULL
+#define AUTO_KVFREE(name) *name __free(kvfree) = NULL
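
A hedged sketch of the intended use: declaring the pointer through the macro attaches the cleanup attribute from linux/cleanup.h, so kfree() runs automatically whenever the variable goes out of scope and early returns need no explicit free (the function name and validation below are illustrative only):

/* Illustrative only: buf is kfree()d automatically on every return path. */
static int example_copy_name(const char *src, size_t len)
{
	char AUTO_KFREE(buf);

	buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	memcpy(buf, src, len);
	buf[len] = 0;
	if (buf[0] == '.')
		return -EINVAL;		/* freed here ... */
	return 0;			/* ... and here */
}
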
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
@@ -18,6 +28,54 @@
name = (1U << __ ## name ## _BIT), \
__ ## name ## _SEQ = __ ## name ## _BIT
+static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter)
+{
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ return bvec_phys(&bv);
+}
+
+/*
+ * Iterate bio using btrfs block size.
+ *
+ * This handles large folios and highmem.
+ *
+ * @paddr: Physical memory address of each iteration
+ * @bio: The bio to iterate
+ * @iter: The bvec_iter (pointer) to use.
+ * @blocksize: The block size to iterate with.
+ *
+ * This requires all folios in the bio to cover at least one block.
+ */
+#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \
+ for (; (iter)->bi_size && \
+ (paddr = bio_iter_phys((bio), (iter)), 1); \
+ bio_advance_iter_single((bio), (iter), (blocksize)))
+
+/* Initialize a bvec_iter to the size of the specified bio. */
+static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ u32 bio_size = 0;
+ int i;
+
+ bio_for_each_bvec_all(bvec, bio, i)
+ bio_size += bvec->bv_len;
+
+ return (struct bvec_iter) {
+ .bi_sector = 0,
+ .bi_size = bio_size,
+ .bi_idx = 0,
+ .bi_bvec_done = 0,
+ };
+}
+
+#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \
+ for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \
+ (iter).bi_size && \
+ (paddr = bio_iter_phys((bio), &(iter)), 1); \
+ bio_advance_iter_single((bio), &(iter), (blocksize)))
+
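
A hedged sketch of how the block iterator is meant to be used: copy the bio's iterator, then walk the data one filesystem block at a time and receive the physical address of each block, regardless of folio size or highmem (the per-block checksum helper named below is a placeholder, not an existing function):

/* Illustrative only: iterate @bio in @blocksize steps. */
static void example_csum_bio(struct bio *bio, u32 blocksize)
{
	struct bvec_iter iter = bio->bi_iter;
	phys_addr_t paddr;

	btrfs_bio_for_each_block(paddr, bio, &iter, blocksize)
		example_csum_one_block(paddr, blocksize);	/* placeholder */
}
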
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
@@ -119,28 +177,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
return ret;
}
-static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
+static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_simple_node *entry;
+ struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node);
+ struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node);
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct rb_simple_node, rb_node);
+ if (new_entry->bytenr < existing_entry->bytenr)
+ return -1;
+ else if (new_entry->bytenr > existing_entry->bytenr)
+ return 1;
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
+ return 0;
+}
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+static inline struct rb_node *rb_simple_insert(struct rb_root *root,
+ struct rb_simple_node *simple_node)
+{
+ return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp);
}
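
rb_find_add() takes a comparator with the kernel's new-versus-existing convention: a negative return descends left, a positive return descends right, and 0 means an equal node is already present, in which case rb_find_add() returns that node and inserts nothing. A hedged caller-side sketch using the helpers above (the function name is illustrative):

/* Illustrative only: NULL from rb_simple_insert() means @simple_node was inserted. */
static struct rb_simple_node *example_insert(struct rb_root *root,
					     struct rb_simple_node *simple_node)
{
	struct rb_node *existing = rb_simple_insert(root, simple_node);

	if (existing)	/* a node with the same bytenr already exists */
		return rb_entry(existing, struct rb_simple_node, rb_node);
	return simple_node;
}
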
static inline bool bitmap_test_range_all_set(const unsigned long *addr,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..5df02c707aee 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
- /* For nocow write, we can release the qgroup rsv right now */
+ /*
+ * For a NOCOW write we can free the qgroup reserve right now. For a COW
+ * one we transfer the reserved space from the inode's iotree into the
+ * ordered extent by calling btrfs_qgroup_release_data() and tracking
+ * the qgroup reserved amount in the ordered extent, so that later after
+ * completing the ordered extent, when running the data delayed ref it
+ * creates, we free the reserved data with btrfs_qgroup_free_refroot().
+ */
+ if (is_nocow)
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- } else {
- /*
- * The ordered extent has reserved qgroup space, release now
- * and pass the reserved number for qgroup_record to free.
- */
+ else
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
- return ERR_PTR(-ENOMEM);
+ if (!entry) {
+ entry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
+ if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ entry = ERR_PTR(-ESTALE);
+ goto out;
+ }
+ entry->inode = inode;
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
+out:
+ if (IS_ERR(entry) && !is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
+
return entry;
}
@@ -221,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -312,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
{
struct btrfs_inode *inode = entry->inode;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
@@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(file_offset + len <= folio_next_pos(folio));
/*
* Ordered flag indicates whether we still have
@@ -401,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
- unsigned long flags;
bool ret;
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
/*
* If this is a COW write it means we created new extent maps for the
@@ -465,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
u64 cur = file_offset;
+ const u64 end = file_offset + num_bytes;
- trace_btrfs_writepage_end_io_hook(inode, file_offset,
- file_offset + num_bytes - 1,
- uptodate);
+ trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
- while (cur < file_offset + num_bytes) {
+ spin_lock(&inode->ordered_tree_lock);
+ while (cur < end) {
u64 entry_end;
- u64 end;
- u32 len;
+ u64 this_end;
+ u64 len;
node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
@@ -519,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
* |
* cur
*/
- end = min(entry->file_offset + entry->num_bytes,
- file_offset + num_bytes) - 1;
- ASSERT(end + 1 - cur < U32_MAX);
- len = end + 1 - cur;
+ this_end = min(entry_end, end);
+ len = this_end - cur;
+ ASSERT(len < U32_MAX);
if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
}
cur += len;
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -557,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
if (cached && *cached) {
entry = *cached;
goto have_entry;
@@ -597,7 +608,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return finished;
}
@@ -607,23 +618,18 @@ out:
*/
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct list_head *cur;
- struct btrfs_ordered_sum *sum;
-
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
+ struct btrfs_ordered_sum *sum;
+ struct btrfs_ordered_sum *tmp;
+
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
- if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
- while (!list_empty(&entry->list)) {
- cur = entry->list.next;
- sum = list_entry(cur, struct btrfs_ordered_sum, list);
- list_del(&sum->list);
+ btrfs_add_delayed_iput(entry->inode);
+ list_for_each_entry_safe(sum, tmp, &entry->list, list)
kvfree(sum);
- }
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -667,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- spin_lock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_lock(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
@@ -675,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_unlock(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -842,10 +848,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
/*
* Start IO and wait for a given ordered extent to finish.
*
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent, except those in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len), and for the
+ * IO completion code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -865,8 +873,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
- if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+ if (!nowriteback_len) {
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ } else {
+ if (start < nowriteback_start)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+ nowriteback_start - 1);
+ if (nowriteback_start + nowriteback_len < end)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+ nowriteback_start + nowriteback_len,
+ end);
+ }
+ }
if (!freespace_inode)
btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -947,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -962,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -975,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node) {
node = ordered_tree_search(inode, file_offset + len);
@@ -1002,7 +1020,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1017,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
btrfs_assert_inode_locked(inode);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
@@ -1031,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -1044,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -1053,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1075,7 +1093,7 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
@@ -1130,7 +1148,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1160,7 +1178,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1173,7 +1191,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -1191,7 +1209,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
{
struct btrfs_ordered_extent *ordered;
- if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+ if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1199,7 +1217,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return true;
btrfs_put_ordered_extent(ordered);
- unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}
@@ -1229,6 +1247,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
@@ -1250,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/*
* Take the root's ordered_extent_lock to avoid a race with
* btrfs_wait_ordered_extents() when updating the disk_bytenr and
- * disk_num_bytes fields of the ordered extent below. And we disable
- * IRQs because the inode's ordered_tree_lock is used in IRQ context
- * elsewhere.
+ * disk_num_bytes fields of the ordered extent below.
*
* There's no concern about a previous caller of
* btrfs_wait_ordered_extents() getting the trimmed ordered extent
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4e152736d06c..1e6b0b182b29 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -17,6 +17,7 @@
struct inode;
struct page;
struct extent_state;
+struct btrfs_block_group;
struct btrfs_inode;
struct btrfs_root;
struct btrfs_fs_info;
@@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len);
+static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ return btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
+}
+
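
The inline wrapper above keeps the historical btrfs_start_ordered_extent() behaviour by passing an empty no-writeback range, so existing callers are unchanged; callers that must not kick writeback on a sub-range (for example a folio they already hold locked) call the _nowriteback variant directly. A hedged sketch (the folio-based range is illustrative):

	/* Illustrative only: wait for the whole ordered extent ... */
	btrfs_start_ordered_extent(ordered);

	/* ... or skip starting writeback for the locked folio's range. */
	btrfs_start_ordered_extent_nowriteback(ordered, folio_pos(folio),
					       folio_size(folio));
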
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fc821aa446f0..f189bf09ce6a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -6,12 +6,19 @@
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
+#include "file-item.h"
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
#include "volumes.h"
#include "raid-stripe-tree.h"
+/*
+ * Large enough buffer size for the stringification of any key type yet short
+ * enough to use the stack and avoid allocations.
+ */
+#define KEY_TYPE_BUF_SIZE 32
+
struct root_name_map {
u64 id;
const char *name;
@@ -124,7 +131,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
- pr_info("\t\ttree block key (%llu %u %llu) level %d\n",
+ pr_info("\t\ttree block key " BTRFS_KEY_FMT " level %d\n",
btrfs_disk_key_objectid(&key), key.type,
btrfs_disk_key_offset(&key),
btrfs_tree_block_level(eb, info));
@@ -190,7 +197,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ btrfs_warn(l->fs_info, "uuid item with illegal size %lu",
(unsigned long)item_size);
return;
}
@@ -223,25 +230,212 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
- atomic_read(&eb->refs), eb->lock_owner, current->pid);
+ refcount_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
+static void print_timespec(const struct extent_buffer *eb,
+ struct btrfs_timespec *timespec,
+ const char *prefix, const char *suffix)
+{
+ const u64 secs = btrfs_timespec_sec(eb, timespec);
+ const u32 nsecs = btrfs_timespec_nsec(eb, timespec);
+
+ pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix);
+}
+
+static void print_inode_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+
+ pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n",
+ btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii),
+ btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii));
+ pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n",
+ btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii),
+ btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii),
+ btrfs_inode_gid(eb, ii));
+ pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n",
+ btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii),
+ btrfs_inode_flags(eb, ii));
+ print_timespec(eb, &ii->atime, "\t\tatime ", "\n");
+ print_timespec(eb, &ii->ctime, "\t\tctime ", "\n");
+ print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n");
+ print_timespec(eb, &ii->otime, "\t\totime ", "\n");
+}
+
+static void print_dir_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u32 name_len = btrfs_dir_name_len(eb, di);
+ const u32 data_len = btrfs_dir_data_len(eb, di);
+ const u32 len = sizeof(*di) + name_len + data_len;
+ struct btrfs_key location;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ pr_info("\t\tlocation key " BTRFS_KEY_FMT " type %d\n",
+ BTRFS_KEY_FMT_VALUE(&location), btrfs_dir_ftype(eb, di));
+ pr_info("\t\ttransid %llu data_len %u name_len %u\n",
+ btrfs_dir_transid(eb, di), data_len, name_len);
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+ }
+}
+
+static void print_inode_ref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u64 index = btrfs_inode_ref_index(eb, ref);
+ const u32 name_len = btrfs_inode_ref_name_len(eb, ref);
+ const u32 len = sizeof(*ref) + name_len;
+
+ pr_info("\t\tindex %llu name_len %u\n", index, name_len);
+ ref = (struct btrfs_inode_ref *)((char *)ref + len);
+ cur += len;
+ }
+}
+
+static void print_inode_extref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_extref *extref;
+ u32 cur = 0;
+
+ extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref);
+ while (cur < size) {
+ const u64 index = btrfs_inode_extref_index(eb, extref);
+ const u32 name_len = btrfs_inode_extref_name_len(eb, extref);
+ const u64 parent = btrfs_inode_extref_parent(eb, extref);
+ const u32 len = sizeof(*extref) + name_len;
+
+ pr_info("\t\tindex %llu parent %llu name_len %u\n",
+ index, parent, name_len);
+ extref = (struct btrfs_inode_extref *)((char *)extref + len);
+ cur += len;
+ }
+}
+
+static void print_dir_log_index_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_dir_log_item *dlog;
+
+ dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item);
+ pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog));
+}
+
+static void print_extent_csum(const struct extent_buffer *eb, int i)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ const u32 size = btrfs_item_size(eb, i);
+ const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(eb, &key, i);
+ pr_info("\t\trange start %llu end %llu length %u\n",
+ key.offset, key.offset + csum_bytes, csum_bytes);
+}
+
+static void print_file_extent_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(eb, fi),
+ btrfs_file_extent_type(eb, fi));
+
+ if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n",
+ btrfs_file_extent_inline_item_len(eb, i),
+ btrfs_file_extent_ram_bytes(eb, fi),
+ btrfs_file_extent_compression(eb, fi));
+ return;
+ }
+
+ pr_info("\t\textent data disk bytenr %llu nr %llu\n",
+ btrfs_file_extent_disk_bytenr(eb, fi),
+ btrfs_file_extent_disk_num_bytes(eb, fi));
+ pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
+ btrfs_file_extent_offset(eb, fi),
+ btrfs_file_extent_num_bytes(eb, fi),
+ btrfs_file_extent_ram_bytes(eb, fi));
+ pr_info("\t\textent compression %hhu\n",
+ btrfs_file_extent_compression(eb, fi));
+}
+
+static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size)
+{
+ static const char *key_to_str[256] = {
+ [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM",
+ [BTRFS_INODE_REF_KEY] = "INODE_REF",
+ [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF",
+ [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM",
+ [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX",
+ [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM",
+ [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX",
+ [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM",
+ [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM",
+ [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM",
+ [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM",
+ [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM",
+ [BTRFS_ROOT_REF_KEY] = "ROOT_REF",
+ [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF",
+ [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM",
+ [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM",
+ [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF",
+ [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF",
+ [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF",
+ [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF",
+ [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF",
+ [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM",
+ [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA",
+ [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM",
+ [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO",
+ [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT",
+ [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP",
+ [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM",
+ [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM",
+ [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT",
+ [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM",
+ [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE",
+ [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM",
+ [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS",
+ [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION",
+ [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO",
+ [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT",
+ [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM",
+ [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL",
+ [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL",
+ [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE",
+ };
+
+ if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ scnprintf(buf, buf_size, "UNTYPED");
+ else if (key_to_str[key->type])
+ scnprintf(buf, buf_size, "%s", key_to_str[key->type]);
+ else
+ scnprintf(buf, buf_size, "UNKNOWN.%d", key->type);
+}
+
void btrfs_print_leaf(const struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
struct btrfs_root_item *ri;
- struct btrfs_dir_item *di;
- struct btrfs_inode_item *ii;
struct btrfs_block_group_item *bi;
- struct btrfs_file_extent_item *fi;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct btrfs_dev_extent *dev_extent;
struct btrfs_key key;
- struct btrfs_key found_key;
if (!l)
return;
@@ -255,25 +449,35 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
+ char key_buf[KEY_TYPE_BUF_SIZE];
+
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
- pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
- i, key.objectid, type, key.offset,
+ key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE);
+
+ pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, key_buf, key.offset,
btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
- ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
- pr_info("\t\tinode generation %llu size %llu mode %o\n",
- btrfs_inode_generation(l, ii),
- btrfs_inode_size(l, ii),
- btrfs_inode_mode(l, ii));
+ print_inode_item(l, i);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ print_inode_ref_item(l, i);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ print_inode_extref_item(l, i);
break;
case BTRFS_DIR_ITEM_KEY:
- di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu flags %u\n",
- found_key.objectid,
- btrfs_dir_flags(l, di));
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ print_dir_item(l, i);
+ break;
+ case BTRFS_DIR_LOG_INDEX_KEY:
+ print_dir_log_index_item(l, i);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ print_extent_csum(l, i);
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -303,24 +507,7 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_shared_data_ref_count(l, sref));
break;
case BTRFS_EXTENT_DATA_KEY:
- fi = btrfs_item_ptr(l, i,
- struct btrfs_file_extent_item);
- pr_info("\t\tgeneration %llu type %hhu\n",
- btrfs_file_extent_generation(l, fi),
- btrfs_file_extent_type(l, fi));
- if (btrfs_file_extent_type(l, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- pr_info("\t\tinline extent data size %llu\n",
- btrfs_file_extent_ram_bytes(l, fi));
- break;
- }
- pr_info("\t\textent data disk bytenr %llu nr %llu\n",
- btrfs_file_extent_disk_bytenr(l, fi),
- btrfs_file_extent_disk_num_bytes(l, fi));
- pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
- btrfs_file_extent_offset(l, fi),
- btrfs_file_extent_num_bytes(l, fi),
- btrfs_file_extent_ram_bytes(l, fi));
+ print_file_extent_item(l, i);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
@@ -410,10 +597,9 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow)
print_eb_refs_lock(c);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
- pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
- i, key.objectid, key.type, key.offset,
- btrfs_node_blockptr(c, i),
- btrfs_node_ptr_generation(c, i));
+ pr_info("\tkey %d " BTRFS_KEY_FMT " block %llu gen %llu\n",
+ i, BTRFS_KEY_FMT_VALUE(&key), btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i));
}
if (!follow)
return;
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 8504bf1702c7..d0e620bf5f5a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+#include <linux/types.h>
+
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b8fa34e16abb..adc956432d2f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -26,8 +26,8 @@ struct prop_handler {
const char *xattr_name;
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
- int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(const struct inode *inode);
+ int (*apply)(struct btrfs_inode *inode, const char *value, size_t len);
+ const char *(*extract)(const struct btrfs_inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, NULL, 0);
+ ret = handler->apply(inode, NULL, 0);
ASSERT(ret == 0);
return ret;
@@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, value, value_len);
+ ret = handler->apply(inode, value, value_len);
if (ret) {
btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
@@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- ret = handler->apply(inode, value, len);
+ ret = handler->apply(BTRFS_I(inode), value, len);
if (unlikely(ret))
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
@@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx,
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct btrfs_root *root = inode->root;
+ u64 ino = btrfs_ino(inode);
- return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+ return iterate_object_props(root, path, ino, inode_prop_iterator,
+ &inode->vfs_inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
@@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
return -EINVAL;
}
-static int prop_compression_apply(struct inode *inode, const char *value,
+static int prop_compression_apply(struct btrfs_inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int type;
/* Reset to defaults */
if (len == 0) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
/* Set NOCOMPRESS flag */
if ((len == 2 && strncmp("no", value, 2) == 0) ||
(len == 4 && strncmp("none", value, 4) == 0)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
@@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return -EINVAL;
}
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = type;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
+ inode->prop_compress = type;
return 0;
}
@@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(const struct inode *inode)
+static const char *prop_compression_extract(const struct btrfs_inode *inode)
{
- switch (BTRFS_I(inode)->prop_compress) {
+ switch (inode->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
case BTRFS_COMPRESS_LZO:
case BTRFS_COMPRESS_ZSTD:
- return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ return btrfs_compress_type2str(inode->prop_compress);
default:
break;
}
@@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, const struct inode *parent)
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *parent)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
bool need_reserve = false;
- if (!test_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(parent)->runtime_flags))
+ if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags))
return 0;
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
@@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
- if (h->ignore(BTRFS_I(inode)))
+ if (h->ignore(inode))
continue;
value = h->extract(parent);
@@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(BTRFS_I(inode), value, strlen(value));
+ ret = h->validate(inode, value, strlen(value));
if (ret)
continue;
@@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value,
strlen(value), 0);
if (!ret) {
ret = h->apply(inode, value, strlen(value));
if (ret)
- btrfs_setxattr(trans, inode, h->xattr_name,
+ btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name,
NULL, 0, 0);
else
- set_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
}
if (need_reserve) {
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 63546d0a9444..15d9a025c923 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,9 +6,9 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
+#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct btrfs_inode;
struct btrfs_path;
struct btrfs_trans_handle;
@@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
const char *value, size_t value_len);
bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- const struct inode *dir);
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a6f92836c9b1..9e2b53e90dcb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
qgroup->rsv.values[type] += num_bytes;
}
@@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
if (qgroup->rsv.values[type] >= num_bytes) {
qgroup->rsv.values[type] -= num_bytes;
return;
@@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *qgroupid = key;
+ const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < *qgroupid)
+ return -1;
+ else if (qgroup->qgroupid > *qgroupid)
+ return 1;
+
+ return 0;
+}
+
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
- struct rb_node *n = fs_info->qgroup_tree.rb_node;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
- while (n) {
- qgroup = rb_entry(n, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
- n = n->rb_left;
- else if (qgroup->qgroupid > qgroupid)
- n = n->rb_right;
- else
- return qgroup;
- }
- return NULL;
+ node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp);
+ return rb_entry_safe(node, struct btrfs_qgroup, node);
+}
+
+static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node);
+
+ return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing);
}
/*
@@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
- struct rb_node **p = &fs_info->qgroup_tree.rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
- while (*p) {
- parent = *p;
- qgroup = rb_entry(parent, struct btrfs_qgroup, node);
-
- if (qgroup->qgroupid < qgroupid) {
- p = &(*p)->rb_left;
- } else if (qgroup->qgroupid > qgroupid) {
- p = &(*p)->rb_right;
- } else {
- kfree(prealloc);
- return qgroup;
- }
+ prealloc->qgroupid = qgroupid;
+ node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp);
+ if (node) {
+ kfree(prealloc);
+ return rb_entry(node, struct btrfs_qgroup, node);
}
- qgroup = prealloc;
- qgroup->qgroupid = qgroupid;
- INIT_LIST_HEAD(&qgroup->groups);
- INIT_LIST_HEAD(&qgroup->members);
- INIT_LIST_HEAD(&qgroup->dirty);
- INIT_LIST_HEAD(&qgroup->iterator);
- INIT_LIST_HEAD(&qgroup->nested_iterator);
+ INIT_LIST_HEAD(&prealloc->groups);
+ INIT_LIST_HEAD(&prealloc->members);
+ INIT_LIST_HEAD(&prealloc->dirty);
+ INIT_LIST_HEAD(&prealloc->iterator);
+ INIT_LIST_HEAD(&prealloc->nested_iterator);
- rb_link_node(&qgroup->node, parent, p);
- rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
-
- return qgroup;
+ return prealloc;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
@@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid
}
#endif
-static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+__printf(2, 3)
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
+ const u64 old_flags = fs_info->qgroup_flags;
+
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+ if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf);
+ va_end(args);
+ }
}
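
With the __printf(2, 3) signature above, callers now pass the reason for the inconsistency directly and it is expanded through the struct va_format / %pV mechanism into a single rate-limited warning. A hedged caller-side sketch (the message and variables are illustrative):

	/* Illustrative only: the reason string becomes part of the warning. */
	qgroup_mark_inconsistent(fs_info, "error %d while updating qgroup %llu",
				 ret, qgroupid);
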
static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
@@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if (!fs_info->quota_root)
return 0;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
qgroup_read_enable_gen(fs_info, l, slot, ptr);
- } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_err(fs_info,
- "qgroup generation mismatch, marked as inconsistent");
- }
+ else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation)
+ qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch");
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
- (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsistent qgroup config");
- qgroup_mark_inconsistent(fs_info);
- }
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY))
+ qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config");
if (!qgroup) {
struct btrfs_qgroup *prealloc;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
* during mount before we start doing things like creating
* subvolumes.
*/
- if (is_fstree(qgroup->qgroupid) &&
+ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid)
/*
* Don't need to check against BTRFS_LAST_FREE_OBJECTID,
@@ -581,8 +581,6 @@ out:
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
- * first two are in single-threaded paths.And for the third one, we have set
- * quota_root to be null with qgroup_lock held before, so it is safe to clean
- * up the in-memory structures without qgroup_lock held.
+ * first two are in single-threaded paths.
*/
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
+ /*
+ * btrfs_quota_disable() can be called concurrently with
+ * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
+ * lock.
+ */
+ spin_lock(&fs_info->qgroup_lock);
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
+ spin_unlock(&fs_info->qgroup_lock);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
+ spin_lock(&fs_info->qgroup_lock);
}
- /*
- * We call btrfs_free_qgroup_config() when unmounting
- * filesystem and disabling quota, so we set qgroup_ulist
- * to be null here to avoid double free.
- */
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ spin_unlock(&fs_info->qgroup_lock);
+
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -661,7 +660,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -673,10 +672,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
key.offset = dst;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
-
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-
- btrfs_free_path(path);
return ret;
}
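
Several of the hunks above (and many below) replace "struct btrfs_path *path" plus explicit btrfs_free_path()/"goto out" cleanup with BTRFS_PATH_AUTO_FREE(path), which builds on the scope-based cleanup helpers in <linux/cleanup.h>: the path is released automatically when the variable goes out of scope, so early returns need no cleanup label. A rough sketch of that idiom with a hypothetical type, not the actual btrfs macro definition:

#include <linux/cleanup.h>
#include <linux/slab.h>

struct widget {
	int value;
};

/* Teach cleanup.h how to dispose of a struct widget pointer. */
DEFINE_FREE(widget_free, struct widget *, kfree(_T))

static int widget_probe(void)
{
	struct widget *w __free(widget_free) = kzalloc(sizeof(*w), GFP_KERNEL);

	if (!w)
		return -ENOMEM;
	/* Every return path below frees 'w' automatically. */
	return w->value;
}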
@@ -685,7 +680,7 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -698,24 +693,19 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
- ret = btrfs_del_item(trans, quota_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, quota_root, path);
}
static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root, u64 qgroupid)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_qgroup_info_item *qgroup_info;
struct btrfs_qgroup_limit_item *qgroup_limit;
struct extent_buffer *leaf;
@@ -741,7 +731,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
if (ret && ret != -EEXIST)
- goto out;
+ return ret;
leaf = path->nodes[0];
qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
@@ -752,15 +742,13 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
if (ret && ret != -EEXIST)
- goto out;
+ return ret;
leaf = path->nodes[0];
qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
@@ -771,19 +759,14 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -795,33 +778,27 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
key.offset = qgroupid;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
ret = btrfs_del_item(trans, quota_root, path);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
ret = btrfs_del_item(trans, quota_root, path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -829,7 +806,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup)
{
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_limit_item *qgroup_limit;
@@ -849,7 +826,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -860,10 +837,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
- btrfs_mark_buffer_dirty(trans, l);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -872,7 +845,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_info_item *qgroup_info;
@@ -895,7 +868,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -906,10 +879,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
- btrfs_mark_buffer_dirty(trans, l);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -917,7 +886,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_status_item *ptr;
@@ -937,7 +906,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -948,10 +917,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
- btrfs_mark_buffer_dirty(trans, l);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -961,7 +926,7 @@ out:
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf = NULL;
int ret;
@@ -972,13 +937,13 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = 0;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
nr = btrfs_header_nritems(leaf);
if (!nr)
@@ -991,14 +956,12 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
@@ -1014,7 +977,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *ulist = NULL;
const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1037,12 +999,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- ulist = ulist_alloc(GFP_KERNEL);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
@@ -1082,9 +1038,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist;
- ulist = NULL;
-
/*
* initially create the quota tree
*/
@@ -1096,7 +1049,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_root;
@@ -1108,7 +1061,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*ptr));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1121,6 +1074,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
if (simple) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
} else {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -1129,8 +1083,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = 0;
@@ -1139,7 +1091,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
if (ret > 0)
goto out_add_root;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1157,7 +1109,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
- if (!prealloc) {
+ if (unlikely(!prealloc)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_path;
@@ -1165,26 +1117,21 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1198,7 +1145,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
}
ret = btrfs_next_item(tree_root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1209,7 +1156,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1223,7 +1170,7 @@ out_add_root:
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1254,8 +1201,6 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- if (simple)
- btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */
@@ -1291,18 +1236,21 @@ out_free_root:
if (ret)
btrfs_put_root(quota_root);
out:
- if (ret) {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ if (ret)
btrfs_sysfs_del_qgroups(fs_info);
- }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
- ulist_free(ulist);
- kfree(prealloc);
+
+ /*
+ * At this point we either failed to allocate prealloc, or we succeeded
+ * and passed ownership of it to add_qgroup_rb(). In any
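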
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
+
return ret;
}
@@ -1373,11 +1321,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
/*
* We have nothing held here and no trans handle, just return the error
- * if there is one.
+ * if there is one and set back the quota enabled bit since we didn't
+ * actually disable quotas.
*/
ret = flush_reservations(fs_info);
- if (ret)
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
return ret;
+ }
/*
* 1 For the root item
@@ -1412,13 +1363,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_qgroup_config(fs_info);
ret = btrfs_clean_quota_tree(trans, quota_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1489,9 +1440,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1499,15 +1450,16 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
@@ -1574,8 +1526,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
ASSERT(prealloc);
/* Check the level of src and dst first */
- if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
+ kfree(prealloc);
return -EINVAL;
+ }
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
@@ -1698,9 +1652,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
- return 0;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1731,7 +1682,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- kfree(prealloc);
+ /*
+ * At this point we either failed to allocate prealloc, or we succeeded
+ * and passed ownership of it to add_qgroup_rb(). In any
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
return ret;
}
@@ -1743,8 +1699,7 @@ out:
static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
struct btrfs_key key;
- struct btrfs_path *path;
- int ret;
+ BTRFS_PATH_AUTO_FREE(path);
/*
* Squota would never be inconsistent, but there can still be case
@@ -1777,13 +1732,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup
if (!path)
return -ENOMEM;
- ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
- btrfs_free_path(path);
/*
* The @ret from btrfs_find_root() exactly matches our definition for
* the return value, thus can be returned directly.
*/
- return ret;
+ return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1839,9 +1792,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
* Thus its reserved space should all be zero, no matter if qgroup
* is consistent or the mode.
*/
- WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+ if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
+ DEBUG_WARN();
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+
+ }
/*
* The same for rfer/excl numbers, but that's only if our qgroup is
* consistent and if it's in regular qgroup mode.
@@ -1850,15 +1813,15 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
*/
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
- if (WARN_ON(qgroup->rfer || qgroup->excl ||
- qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
- btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
- qgroup->rfer, qgroup->rfer_cmpr,
- qgroup->excl, qgroup->excl_cmpr);
- qgroup_mark_inconsistent(fs_info);
+ if (qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr) {
+ DEBUG_WARN();
+ qgroup_mark_inconsistent(fs_info,
+ "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
}
}
del_qgroup_rb(fs_info, qgroupid);
@@ -1881,18 +1844,15 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
struct btrfs_trans_handle *trans;
int ret;
- if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
+ !fs_info->quota_root)
return 0;
/*
* Commit current transaction to make sure all the rfer/excl numbers
* get updated.
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(fs_info->quota_root);
if (ret < 0)
return ret;
@@ -1905,8 +1865,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
/*
* It's squota and the subvolume still has numbers needed for future
* accounting, in this case we can not delete it. Just skip it.
+ *
+ * Or the qgroup has already been removed by a qgroup rescan. In both
+ * cases it is safe to ignore the error.
*/
- if (ret == -EBUSY)
+ if (ret == -EBUSY || ret == -ENOENT)
ret = 0;
return ret;
}
@@ -1977,11 +1940,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_limit_item(trans, qgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info, "unable to update quota limit for %llu",
- qgroupid);
- }
+ if (ret)
+ qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2036,7 +1996,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
return xa_err(ret);
}
@@ -2103,10 +2063,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_warn(fs_info,
-"error accounting new delayed refs extent (err code: %d), quota inconsistent",
- ret);
+ qgroup_mark_inconsistent(fs_info,
+ "error accounting new delayed refs extent: %d", ret);
return 0;
}
@@ -2327,7 +2285,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
bool trace_leaf)
{
struct btrfs_key key;
- struct btrfs_path *src_path;
+ BTRFS_PATH_AUTO_FREE(src_path);
struct btrfs_fs_info *fs_info = trans->fs_info;
u32 nodesize = fs_info->nodesize;
int cur_level = root_level;
@@ -2339,10 +2297,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
return -EINVAL;
src_path = btrfs_alloc_path();
- if (!src_path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!src_path)
+ return -ENOMEM;
if (dst_level)
btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
@@ -2350,7 +2306,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- atomic_inc(&src_eb->refs);
+ refcount_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2368,10 +2324,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
parent_slot = src_path->slots[cur_level + 1];
eb = btrfs_read_node_slot(eb, parent_slot);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- }
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
src_path->nodes[cur_level] = eb;
@@ -2392,10 +2346,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
&src_key, src_path->slots[cur_level]);
}
/* Content mismatch, something went wrong */
- if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
- ret = -ENOENT;
- goto out;
- }
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key))
+ return -ENOENT;
cur_level--;
}
@@ -2406,21 +2358,20 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize);
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
nodesize);
if (ret < 0)
- goto out;
+ return ret;
/* Record leaf file extents */
if (dst_level == 0 && trace_leaf) {
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
}
-out:
- btrfs_free_path(src_path);
+
return ret;
}
@@ -2459,9 +2410,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < cur_level) {
+ if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level)) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
__func__, cur_level, root_level);
@@ -2477,7 +2428,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
* dst_path->nodes[root_level] must be initialized before
* calling this function.
*/
- if (cur_level == root_level) {
+ if (unlikely(cur_level == root_level)) {
btrfs_err_rl(fs_info,
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level);
@@ -2563,7 +2514,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return 0;
/* Wrong parameter order */
- if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_header_generation(src_eb),
@@ -2571,7 +2522,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
- if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) {
ret = -EIO;
goto out;
}
@@ -2583,7 +2534,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- atomic_inc(&dst_eb->refs);
+ refcount_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2598,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -2621,7 +2572,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
int level;
u8 drop_subptree_thres;
struct extent_buffer *eb = root_eb;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
ASSERT(root_eb != NULL);
@@ -2642,7 +2593,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* mark qgroup inconsistent.
*/
if (root_level >= drop_subptree_thres) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "subtree level reached threshold");
return 0;
}
@@ -2654,12 +2605,12 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(root_eb, &check);
if (ret)
- goto out;
+ return ret;
}
if (root_level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
- goto out;
+ return ret;
}
path = btrfs_alloc_path();
@@ -2675,7 +2626,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- atomic_inc(&root_eb->refs); /* For path */
+ refcount_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2695,10 +2646,8 @@ walk_down:
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
eb = btrfs_read_node_slot(eb, parent_slot);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- }
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
@@ -2709,14 +2658,14 @@ walk_down:
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize);
if (ret)
- goto out;
+ return ret;
}
if (level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans,
path->nodes[level]);
if (ret)
- goto out;
+ return ret;
/* Nonzero return here means we completed our search */
ret = adjust_slots_upwards(path, root_level);
@@ -2730,11 +2679,7 @@ walk_down:
level--;
}
- ret = 0;
-out:
- btrfs_free_path(path);
-
- return ret;
+ return 0;
}
static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
@@ -2762,7 +2707,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head)
*/
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
struct ulist *roots, struct list_head *qgroups,
- u64 seq, int update_old)
+ u64 seq, bool update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2846,8 +2791,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(fs_info, qg, cur_old_count,
- cur_new_count);
+ trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2941,7 +2886,7 @@ static int maybe_fs_roots(struct ulist *roots)
* trees.
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/
- return is_fstree(unode->val);
+ return btrfs_is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
@@ -3109,8 +3054,7 @@ cleanup:
kfree(record);
}
- trace_qgroup_num_dirty_extents(fs_info, trans->transid,
- num_dirty_extents);
+ trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents);
return ret;
}
@@ -3143,10 +3087,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup info item update error %d", ret);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup limit item update error %d", ret);
spin_lock(&fs_info->qgroup_lock);
}
if (btrfs_qgroup_enabled(fs_info))
@@ -3157,7 +3103,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup status item update error %d", ret);
return ret;
}
@@ -3332,13 +3279,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
- struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_qgroup_list **qlist_prealloc = NULL;
bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
if (!prealloc)
return -ENOMEM;
@@ -3362,8 +3312,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!btrfs_qgroup_enabled(fs_info))
- goto out;
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -3564,7 +3512,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan");
if (qlist_prealloc) {
for (int i = 0; i < inherit->num_qgroups; i++)
kfree(qlist_prealloc[i]);
@@ -3572,7 +3520,14 @@ out:
}
if (free_inherit)
kfree(inherit);
- kfree(prealloc);
+
+ /*
+ * At this point we either failed to allocate prealloc, or we succeeded
+ * and passed ownership of it to add_qgroup_rb(). In any
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
+
return ret;
}
@@ -3598,7 +3553,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
int ret = 0;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return 0;
if (num_bytes == 0)
@@ -3658,7 +3613,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return;
if (num_bytes == 0)
@@ -3740,10 +3695,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
path, 1, 0);
btrfs_debug(fs_info,
- "current progress key (%llu %u %llu), search_slot ret %d",
- fs_info->qgroup_rescan_progress.objectid,
- fs_info->qgroup_rescan_progress.type,
- fs_info->qgroup_rescan_progress.offset, ret);
+ "current progress key " BTRFS_KEY_FMT ", search_slot ret %d",
+ BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret);
if (ret) {
/*
@@ -3845,8 +3798,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
*/
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
@@ -4046,12 +3999,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ /*
+ * The rescan worker is only for full accounting qgroups, check if it's
+ * enabled as it is pointless to queue it otherwise. A concurrent quota
+ * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
+ */
+ if (btrfs_qgroup_full_accounting(fs_info)) {
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ } else {
+ ret = -ENOTCONN;
+ }
mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ return ret;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
@@ -4138,8 +4100,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
- clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
- entry_end, EXTENT_QGROUP_RESERVED);
+ clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end,
+ EXTENT_QGROUP_RESERVED, NULL);
if (!ret && clear_ret < 0)
ret = clear_ret;
@@ -4226,7 +4188,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || len == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4241,8 +4203,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&inode->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+ ret = btrfs_set_record_extent_bits(&inode->io_tree, start,
+ start + len - 1, EXTENT_QGROUP_RESERVED,
+ reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
@@ -4335,9 +4298,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&inode->io_tree, free_start,
- free_start + free_len - 1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED,
+ &changeset);
if (ret < 0)
goto out;
freed += changeset.bytes_changed;
@@ -4361,9 +4325,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
- return clear_record_extent_bits(&inode->io_tree, start,
- start + len - 1,
- EXTENT_QGROUP_RESERVED, NULL);
+ return btrfs_clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, NULL);
}
/* In release case, we shouldn't have @reserved */
@@ -4371,8 +4335,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -4477,11 +4441,11 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -4522,11 +4486,11 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
- trace_qgroup_meta_free_all_pertrans(root);
+ trace_btrfs_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4538,7 +4502,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4548,7 +4512,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
@@ -4597,12 +4561,12 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- trace_qgroup_meta_convert(root, num_bytes);
+ trace_btrfs_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
if (!sb_rdonly(fs_info->sb))
add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4620,8 +4584,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
@@ -4681,6 +4645,28 @@ out:
spin_unlock(&swapped_blocks->lock);
}
+static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct btrfs_qgroup_swapped_block *block = rb_entry(node,
+ struct btrfs_qgroup_swapped_block, node);
+
+ if (block->subvol_bytenr < *bytenr)
+ return -1;
+ else if (block->subvol_bytenr > *bytenr)
+ return 1;
+
+ return 0;
+}
+
+static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new,
+ struct btrfs_qgroup_swapped_block, node);
+
+ return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing);
+}
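
The two comparators above plug into the generic rb_find()/rb_find_add() helpers from <linux/rbtree.h>: rb_find_add() links the new node and returns NULL on success, or returns the already-existing node on a key collision, which is exactly what the duplicate handling further down relies on. A small, self-contained sketch of that contract (struct and field names are illustrative only):

#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	u64 key;
};

static int item_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct item *a = rb_entry(new, struct item, node);
	const struct item *b = rb_entry(existing, struct item, node);

	if (a->key < b->key)
		return -1;
	if (a->key > b->key)
		return 1;
	return 0;
}

static bool item_insert(struct rb_root *root, struct item *it)
{
	/* NULL means the node was inserted; non-NULL is the colliding entry. */
	return rb_find_add(&it->node, root, item_cmp) == NULL;
}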
+
/*
* Add subtree roots record into @subvol_root.
*
@@ -4700,16 +4686,15 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
- struct rb_node **cur;
- struct rb_node *parent = NULL;
+ struct rb_node *node;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
- btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
@@ -4750,46 +4735,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
- cur = &blocks->blocks[level].rb_node;
- while (*cur) {
+ node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp);
+ if (node) {
struct btrfs_qgroup_swapped_block *entry;
- parent = *cur;
- entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
- node);
+ entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (entry->subvol_bytenr < block->subvol_bytenr) {
- cur = &(*cur)->rb_left;
- } else if (entry->subvol_bytenr > block->subvol_bytenr) {
- cur = &(*cur)->rb_right;
- } else {
- if (entry->subvol_generation !=
- block->subvol_generation ||
- entry->reloc_bytenr != block->reloc_bytenr ||
- entry->reloc_generation !=
- block->reloc_generation) {
- /*
- * Duplicated but mismatch entry found.
- * Shouldn't happen.
- *
- * Marking qgroup inconsistent should be enough
- * for end users.
- */
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- ret = -EEXIST;
- }
- kfree(block);
- goto out_unlock;
+ if (entry->subvol_generation != block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation != block->reloc_generation) {
+ /*
+ * Duplicated but mismatched entry found. Shouldn't happen.
+ * Marking qgroup inconsistent should be enough for end
+ * users.
+ */
+ DEBUG_WARN("duplicated but mismatched entry found");
+ ret = -EEXIST;
}
+ kfree(block);
+ goto out_unlock;
}
- rb_link_node(&block->node, parent, cur);
- rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -4806,10 +4777,9 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
- struct btrfs_qgroup_swapped_block *block;
+ struct btrfs_qgroup_swapped_block AUTO_KFREE(block);
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
- bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
@@ -4817,7 +4787,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
+ if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4825,23 +4795,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
return 0;
}
- node = blocks->blocks[level].rb_node;
-
- while (node) {
- block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (block->subvol_bytenr < subvol_eb->start) {
- node = node->rb_left;
- } else if (block->subvol_bytenr > subvol_eb->start) {
- node = node->rb_right;
- } else {
- found = true;
- break;
- }
- }
- if (!found) {
+ node = rb_find(&subvol_eb->start, &blocks->blocks[level],
+ qgroup_swapped_block_bytenr_key_cmp);
+ if (!node) {
spin_unlock(&blocks->lock);
goto out;
}
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -4865,7 +4826,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
reloc_eb = NULL;
goto free_out;
}
- if (!extent_buffer_uptodate(reloc_eb)) {
+ if (unlikely(!extent_buffer_uptodate(reloc_eb))) {
ret = -EIO;
goto free_out;
}
@@ -4873,14 +4834,12 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
block->last_snapshot, block->trace_leaf);
free_out:
- kfree(block);
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
- btrfs_err_rl(fs_info,
- "failed to account subtree at bytenr %llu: %d",
- subvol_eb->start, ret);
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
}
return ret;
}
@@ -4911,7 +4870,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
return 0;
- if (!is_fstree(root))
+ if (!btrfs_is_fstree(root))
return 0;
/* If the extent predates enabling quotas, don't count it. */
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e233cc79af18..a979fd59a4da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args;
struct btrfs_trans_handle;
struct btrfs_delayed_ref_root;
struct btrfs_inode;
+struct btrfs_transaction;
+struct btrfs_block_group;
+struct btrfs_qgroup_swapped_blocks;
/*
* Btrfs qgroup overview
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 9ffc79f250fb..2987cb7c686e 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,12 +13,13 @@
#include "volumes.h"
#include "print-tree.h"
-static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *oldkey,
u64 newlen, u64 frontpad)
{
- struct btrfs_stripe_extent *extent;
+ struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
+ struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem);
struct extent_buffer *leaf;
int slot;
size_t item_size;
@@ -27,30 +28,42 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
.type = BTRFS_RAID_STRIPE_KEY,
.offset = newlen,
};
+ int ret;
+ ASSERT(newlen > 0);
ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
leaf = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
+
+ newitem = kzalloc(item_size, GFP_NOFS);
+ if (!newitem)
+ return -ENOMEM;
+
extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
struct btrfs_raid_stride *stride = &extent->strides[i];
u64 phys;
- phys = btrfs_raid_stride_physical(leaf, stride);
- btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+ phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
+ btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);
}
- btrfs_set_item_key_safe(trans, path, &newkey);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ return ret;
+
+ btrfs_release_path(path);
+ return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);
}
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *stripe_root = fs_info->stripe_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
u64 found_start;
@@ -59,9 +72,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
int slot;
int ret;
- if (!stripe_root)
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)
return 0;
+ if (!btrfs_is_testing(fs_info)) {
+ struct btrfs_chunk_map *map;
+ bool use_rst;
+
+ map = btrfs_find_chunk_map(fs_info, start, length);
+ if (!map)
+ return -EINVAL;
+ use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
+ btrfs_free_chunk_map(map);
+ if (!use_rst)
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -85,6 +111,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
found_end = found_start + key.offset;
ret = 0;
+ /*
+ * The stripe extent starts before the range we want to delete,
+ * but the range spans more than one stripe extent:
+ *
+ * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---|
+ * |--- keep ---|--- drop ---|
+ *
+ * This means we have to get the previous item, truncate its
+ * length and then restart the search.
+ */
+ if (found_start > start) {
+ if (slot == 0) {
+ ret = btrfs_previous_item(stripe_root, path, start,
+ BTRFS_RAID_STRIPE_KEY);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
+ } else {
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+ ASSERT(found_start <= start);
+ }
+
if (key.type != BTRFS_RAID_STRIPE_KEY)
break;
@@ -96,6 +153,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
found_start, found_end);
/*
+ * The stripe extent starts before the range we want to delete
+ * and ends after the range we want to delete, i.e. we're
+ * punching a hole in the stripe extent:
+ *
+ * |--- RAID Stripe Extent ---|
+ * | keep |--- drop ---| keep |
+ *
+ * This means we need to a) truncate the existing item and b)
+ * create a second item for the remaining range.
+ */
+ if (found_start < start && found_end > end) {
+ size_t item_size;
+ u64 diff_start = start - found_start;
+ u64 diff_end = found_end - end;
+ struct btrfs_stripe_extent *extent;
+ struct btrfs_key newkey = {
+ .objectid = end,
+ .type = BTRFS_RAID_STRIPE_KEY,
+ .offset = diff_end,
+ };
+
+ /* The "right" item. */
+ ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey);
+ if (ret)
+ break;
+
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_stripe_extent);
+
+ for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
+ struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 phys;
+
+ phys = btrfs_raid_stride_physical(leaf, stride);
+ phys += diff_start + length;
+ btrfs_set_raid_stride_physical(leaf, stride, phys);
+ }
+
+ /* The "left" item. */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff_start, 0);
+ break;
+ }
+
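
To make the hole-punch arithmetic above concrete (the numbers here are illustrative, not taken from the patch): suppose a stripe extent covers logical [0, 64K) and the range to delete is start = 16K, length = 32K, so end = 48K. Then diff_start = 16K and diff_end = 64K - 48K = 16K. The duplicated "right" item gets the key (48K, BTRFS_RAID_STRIPE_KEY, 16K) and each of its strides has its physical address advanced by diff_start + length = 48K, matching the logical offset of the surviving tail. The original "left" item is then truncated to diff_start = 16K by btrfs_partially_delete_raid_extent(), leaving a 32K gap between the two items.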
+ /*
* The stripe extent starts before the range we want to delete:
*
* |--- RAID Stripe Extent ---|
@@ -105,11 +210,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
* length to the new size and then re-insert the item.
*/
if (found_start < start) {
- u64 diff = start - found_start;
+ u64 diff_start = start - found_start;
btrfs_partially_delete_raid_extent(trans, path, &key,
- diff, 0);
- break;
+ diff_start, 0);
+
+ start += (key.offset - diff_start);
+ length -= (key.offset - diff_start);
+ if (length == 0)
+ break;
+
+ btrfs_release_path(path);
+ continue;
}
/*
@@ -122,13 +234,16 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
* length to the new size and then re-insert the item.
*/
if (found_end > end) {
- u64 diff = found_end - end;
+ u64 diff_end = found_end - end;
btrfs_partially_delete_raid_extent(trans, path, &key,
- diff, diff);
+ key.offset - length,
+ length);
+ ASSERT(key.offset - diff_end == length);
break;
}
+ /* Finally we can delete the whole item, no more special cases. */
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
break;
@@ -141,7 +256,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -150,7 +264,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
struct btrfs_stripe_extent *stripe_extent,
const size_t item_size)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
int slot;
@@ -169,8 +283,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
- btrfs_mark_buffer_dirty(trans, leaf);
- btrfs_free_path(path);
return ret;
}
@@ -183,12 +295,12 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_key stripe_key;
struct btrfs_root *stripe_root = fs_info->stripe_root;
const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
- struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_stripe_extent AUTO_KFREE(stripe_extent);
const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
int ret;
stripe_extent = kzalloc(item_size, GFP_NOFS);
- if (!stripe_extent) {
+ if (unlikely(!stripe_extent)) {
btrfs_abort_transaction(trans, -ENOMEM);
btrfs_end_transaction(trans);
return -ENOMEM;
@@ -199,12 +311,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
for (int i = 0; i < num_stripes; i++) {
u64 devid = bioc->stripes[i].dev->devid;
u64 physical = bioc->stripes[i].physical;
- u64 length = bioc->stripes[i].length;
struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
- if (length == 0)
- length = bioc->size;
-
btrfs_set_stack_raid_stride_devid(raid_stride, devid);
btrfs_set_stack_raid_stride_physical(raid_stride, physical);
}
@@ -215,13 +323,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
item_size);
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
-
- kfree(stripe_extent);
+ }
return ret;
}
@@ -259,7 +368,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct btrfs_stripe_extent *stripe_extent;
struct btrfs_key stripe_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
@@ -279,13 +388,13 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
return -ENOMEM;
if (stripe->rst_search_commit_root) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
}
ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
if (ret < 0)
- goto free_path;
+ return ret;
if (ret) {
if (path->slots[0] != 0)
path->slots[0]--;
@@ -342,8 +451,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
stripe->physical, devid);
- ret = 0;
- goto free_path;
+ return 0;
}
/* If we're here, we haven't found the requested devid in the stripe. */
@@ -357,8 +465,6 @@ out:
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
}
-free_path:
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 541836421778..69942ad43140 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"
+#include "accessors.h"
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index cdd373c27784..f38d8305e46d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -66,10 +66,10 @@ static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
dump_bioc(fs_info, rbio->bioc);
btrfs_crit(fs_info,
-"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
+"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
rbio->flags, rbio->nr_sectors, rbio->nr_data,
rbio->real_stripes, rbio->stripe_nsectors,
- rbio->scrubp, rbio->dbitmap);
+ rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
}
#define ASSERT_RBIO(expr, rbio) \
@@ -134,15 +134,10 @@ struct btrfs_stripe_hash_table {
};
/*
- * A bvec like structure to present a sector inside a page.
- *
- * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
+ * The all-ones value may still map to a valid PFN, but our paddrs are always
+ * block size aligned, thus such a -1 paddr can never be a valid entry.
*/
-struct sector_ptr {
- struct page *page;
- unsigned int pgoff:24;
- unsigned int uptodate:8;
-};
+#define INVALID_PADDR (~(phys_addr_t)0)
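Why the all-ones value works as a sentinel: every paddr stored by this code points at the start of a step-sized, aligned chunk, and an all-ones physical address can never satisfy that alignment. A tiny user-space check of the idea, assuming a 4K minimum block size:

#include <assert.h>
#include <stdint.h>

typedef uint64_t phys_addr_t;		/* user-space stand-in */
#define INVALID_PADDR (~(phys_addr_t)0)

int main(void)
{
	/* Valid rbio paddrs are aligned to the step size (4K here);
	 * the all-ones sentinel is odd, so it can never collide. */
	assert(INVALID_PADDR % 4096 != 0);
	assert((phys_addr_t)0x12345000 % 4096 == 0);
	return 0;
}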
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
@@ -156,8 +151,8 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
bitmap_free(rbio->error_bitmap);
kfree(rbio->stripe_pages);
- kfree(rbio->bio_sectors);
- kfree(rbio->stripe_sectors);
+ kfree(rbio->bio_paddrs);
+ kfree(rbio->stripe_paddrs);
kfree(rbio->finish_pointers);
}
@@ -200,8 +195,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
- int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
- int i;
+ unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table)
return 0;
@@ -222,7 +216,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
h = table->table;
- for (i = 0; i < num_entries; i++) {
+ for (unsigned int i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
@@ -233,6 +227,24 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
return 0;
}
+static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
+{
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+
+ ASSERT(sector_nr < rbio->nr_sectors);
+ for (int i = 0; i < rbio->sector_nsteps; i++) {
+ unsigned int index = sector_nr * rbio->sector_nsteps + i;
+ phys_addr_t dst = rbio->stripe_paddrs[index];
+ phys_addr_t src = rbio->bio_paddrs[index];
+
+ ASSERT(dst != INVALID_PADDR);
+ ASSERT(src != INVALID_PADDR);
+
+ memcpy_page(phys_to_page(dst), offset_in_page(dst),
+ phys_to_page(src), offset_in_page(src), step);
+ }
+}
+
/*
* caching an rbio means to copy anything from the
* bio_sectors array into the stripe_pages array. We
@@ -253,24 +265,19 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page) {
+ if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
/*
* Even if the sector is not covered by bio, if it is
* a data sector it should still be uptodate as it is
* read from disk.
*/
if (i < rbio->nr_data * rbio->stripe_nsectors)
- ASSERT(rbio->stripe_sectors[i].uptodate);
+ ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
continue;
}
- ASSERT(rbio->stripe_sectors[i].page);
- memcpy_page(rbio->stripe_sectors[i].page,
- rbio->stripe_sectors[i].pgoff,
- rbio->bio_sectors[i].page,
- rbio->bio_sectors[i].pgoff,
- rbio->bioc->fs_info->sectorsize);
- rbio->stripe_sectors[i].uptodate = 1;
+ memcpy_from_bio_to_stripe(rbio, i);
+ set_bit(i, rbio->stripe_uptodate_bitmap);
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -293,19 +300,48 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
-static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
- unsigned int page_nr)
+/* Get the sector number of the first sector covered by @page_nr. */
+static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ u32 sector_nr;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
+ ASSERT(sector_nr < rbio->nr_sectors);
+ return sector_nr;
+}
+
+/*
+ * Get the number of sectors covered by @page_nr.
+ *
+ * For bs > ps cases, the result will always be 1.
+ * For bs <= ps cases, the result will be ps / bs.
+ */
+static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ u32 nr_sectors;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
+ ASSERT(nr_sectors > 0);
+ return nr_sectors;
+}
+
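Two quick worked examples of the page-to-sector mapping above, with sizes chosen purely for illustration: 4K blocks on 64K pages put the start of page 1 at sector 16 with 16 sectors per page, while 16K blocks on 4K pages leave page 3 inside sector 0 with a single sector per page. A plain-arithmetic user-space mirror of the two helpers:

#include <stdio.h>

/* Mirrors page_nr_to_sector_nr()/page_nr_to_num_sectors() in plain arithmetic. */
static unsigned int first_sector_of_page(unsigned int page_nr,
					 unsigned int page_size,
					 unsigned int sectorsize)
{
	return (unsigned int)((unsigned long long)page_nr * page_size / sectorsize);
}

static unsigned int sectors_per_page(unsigned int page_size, unsigned int sectorsize)
{
	unsigned int rounded = (page_size + sectorsize - 1) / sectorsize * sectorsize;

	return rounded / sectorsize;
}

int main(void)
{
	/* bs <= ps: 4K blocks on 64K pages -> page 1 starts at sector 16, 16 sectors/page. */
	printf("%u %u\n", first_sector_of_page(1, 65536, 4096),
	       sectors_per_page(65536, 4096));
	/* bs > ps: 16K blocks on 4K pages -> page 3 is inside sector 0, 1 sector/page. */
	printf("%u %u\n", first_sector_of_page(3, 4096, 16384),
	       sectors_per_page(4096, 16384));
	return 0;
}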
+static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
+ const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
int i;
ASSERT(page_nr < rbio->nr_pages);
+ ASSERT(sector_nr + nr_bits <= rbio->nr_sectors);
- for (i = sectors_per_page * page_nr;
- i < sectors_per_page * page_nr + sectors_per_page;
- i++) {
- if (!rbio->stripe_sectors[i].uptodate)
+ for (i = sector_nr; i < sector_nr + nr_bits; i++) {
+ if (!test_bit(i, rbio->stripe_uptodate_bitmap))
return false;
}
return true;
@@ -318,41 +354,44 @@ static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
*/
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
u32 offset;
int i;
- for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
+ i++, offset += step) {
int page_index = offset >> PAGE_SHIFT;
ASSERT(page_index < rbio->nr_pages);
- rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
- rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ if (!rbio->stripe_pages[page_index])
+ continue;
+
+ rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
+ offset_in_page(offset);
}
}
static void steal_rbio_page(struct btrfs_raid_bio *src,
struct btrfs_raid_bio *dest, int page_nr)
{
- const u32 sectorsize = src->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
- int i;
+ const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
+ const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);
+
+ ASSERT(page_nr < src->nr_pages);
+ ASSERT(sector_nr + nr_bits <= src->nr_sectors);
if (dest->stripe_pages[page_nr])
__free_page(dest->stripe_pages[page_nr]);
dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
src->stripe_pages[page_nr] = NULL;
- /* Also update the sector->uptodate bits. */
- for (i = sectors_per_page * page_nr;
- i < sectors_per_page * page_nr + sectors_per_page; i++)
- dest->stripe_sectors[i].uptodate = true;
+ /* Also update the stripe_uptodate_bitmap bits. */
+ bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
}
static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
- const int sector_nr = (page_nr << PAGE_SHIFT) >>
- rbio->bioc->fs_info->sectorsize_bits;
+ const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);
/*
* We have ensured PAGE_SIZE is aligned with sectorsize, thus
@@ -507,9 +546,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
spin_lock(&table->cache_lock);
while (!list_empty(&table->stripe_cache)) {
- rbio = list_entry(table->stripe_cache.next,
- struct btrfs_raid_bio,
- stripe_cache);
+ rbio = list_first_entry(&table->stripe_cache,
+ struct btrfs_raid_bio, stripe_cache);
__remove_rbio_from_cache(rbio);
}
spin_unlock(&table->cache_lock);
@@ -567,9 +605,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (table->cache_size > RBIO_CACHE_SIZE) {
struct btrfs_raid_bio *found;
- found = list_entry(table->stripe_cache.prev,
- struct btrfs_raid_bio,
- stripe_cache);
+ found = list_last_entry(&table->stripe_cache,
+ struct btrfs_raid_bio,
+ stripe_cache);
if (found != rbio)
__remove_rbio_from_cache(found);
@@ -667,39 +705,62 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
- unsigned int stripe_nr,
- unsigned int sector_nr)
+/* Return the sector index for @stripe_nr and @sector_nr. */
+static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
+ unsigned int ret;
+
ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
- return stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(ret < rbio->nr_sectors);
+ return ret;
+}
+
+/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
+static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ unsigned int step_nr)
+{
+ unsigned int ret;
+
+ ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
+
+ ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
+ ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
+ return ret;
}
-/* Return a sector from rbio->stripe_sectors, not from the bio list */
-static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int stripe_nr,
- unsigned int sector_nr)
+static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr, unsigned int sector_nr,
+ unsigned int step_nr)
{
- return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
- sector_nr)];
+ return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
}
-/* Grab a sector inside P stripe */
-static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int sector_nr)
+static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr, unsigned int step_nr)
{
- return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
+ return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
}
-/* Grab a sector inside Q stripe, return NULL if not RAID6 */
-static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int sector_nr)
+static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr, unsigned int step_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
- return NULL;
- return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
+ return INVALID_PADDR;
+ return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
+}
+
+/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
+static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr, unsigned int sector_nr)
+{
+ return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
}
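A worked example of the flat indexing implemented by rbio_sector_index() and rbio_paddr_index(), assuming a 64K stripe with 16K blocks on 4K pages (so stripe_nsectors = 4 and sector_nsteps = 4); the numbers are only there for the arithmetic:

#include <assert.h>

int main(void)
{
	/* 64K stripe with 16K blocks on 4K pages. */
	const unsigned int stripe_nsectors = 4;	/* 64K / 16K */
	const unsigned int sector_nsteps = 4;	/* 16K / 4K  */
	const unsigned int stripe_nr = 2, sector_nr = 3, step_nr = 1;

	/* rbio_sector_index(): which fs block inside the full stripe. */
	unsigned int sector_index = stripe_nr * stripe_nsectors + sector_nr;	/* 11 */
	/* rbio_paddr_index(): which step-sized entry inside the paddr arrays. */
	unsigned int paddr_index = sector_index * sector_nsteps + step_nr;	/* 45 */

	assert(sector_index == 11);
	assert(paddr_index == 45);
	return 0;
}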
/*
@@ -882,14 +943,14 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
struct bio *next;
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- cur->bi_status = err;
+ cur->bi_status = status;
bio_endio(cur);
cur = next;
}
@@ -899,7 +960,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
@@ -928,13 +989,13 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
extra = bio_list_get(&rbio->bio_list);
free_raid_bio(rbio);
- rbio_endio_bio_list(cur, err);
+ rbio_endio_bio_list(cur, status);
if (extra)
- rbio_endio_bio_list(extra, err);
+ rbio_endio_bio_list(extra, status);
}
/*
- * Get a sector pointer specified by its @stripe_nr and @sector_nr.
+ * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
*
* @rbio: The raid bio
* @stripe_nr: Stripe number, valid range [0, real_stripe)
@@ -944,34 +1005,52 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
*
* The read/modify/write code wants to reuse the original bio page as much
* as possible, and only use stripe_sectors as fallback.
+ *
+ * Return NULL if bio_list_only is set but the specified sector has no
+ * corresponding bio.
*/
-static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
- int stripe_nr, int sector_nr,
- bool bio_list_only)
+static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- struct sector_ptr *sector;
- int index;
+ phys_addr_t *ret = NULL;
+ const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);
- ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
- rbio, stripe_nr);
- ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
- rbio, sector_nr);
+ ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
- index = stripe_nr * rbio->stripe_nsectors + sector_nr;
- ASSERT(index >= 0 && index < rbio->nr_sectors);
-
- spin_lock(&rbio->bio_list_lock);
- sector = &rbio->bio_sectors[index];
- if (sector->page || bio_list_only) {
- /* Don't return sector without a valid page pointer */
- if (!sector->page)
- sector = NULL;
- spin_unlock(&rbio->bio_list_lock);
- return sector;
+ scoped_guard(spinlock, &rbio->bio_list_lock) {
+ if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (rbio->bio_paddrs[index] != INVALID_PADDR)
+ ret = &rbio->bio_paddrs[index];
+ return ret;
+ }
}
- spin_unlock(&rbio->bio_list_lock);
+ return &rbio->stripe_paddrs[index];
+}
- return &rbio->stripe_sectors[index];
+/*
+ * Similar to sector_paddrs_in_rbio(), but with extra consideration for
+ * bs > ps cases, where we can have multiple steps for a fs block.
+ */
+static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr, int step_nr,
+ bool bio_list_only)
+{
+ phys_addr_t ret = INVALID_PADDR;
+ const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);
+
+ ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
+
+ scoped_guard(spinlock, &rbio->bio_list_lock) {
+ if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (rbio->bio_paddrs[index] != INVALID_PADDR)
+ ret = rbio->bio_paddrs[index];
+ return ret;
+ }
+ }
+ return rbio->stripe_paddrs[index];
}
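Both lookups above implement the same policy: prefer the page supplied by the higher layer bio, fall back to the rbio's own stripe page, and report nothing when bio_list_only is set and no bio covers the block. A condensed user-space model of that decision, with INVALID standing in for INVALID_PADDR:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INVALID UINT64_MAX	/* stands in for INVALID_PADDR */

/* Prefer the bio-provided address; otherwise fall back to the rbio's own
 * stripe address, unless the caller only wants bio pages. */
static uint64_t pick_paddr(uint64_t bio_paddr, uint64_t stripe_paddr, bool bio_list_only)
{
	if (bio_paddr != INVALID)
		return bio_paddr;
	if (bio_list_only)
		return INVALID;
	return stripe_paddr;
}

int main(void)
{
	printf("%d\n", pick_paddr(INVALID, 0x1000, false) == 0x1000);	/* falls back */
	printf("%d\n", pick_paddr(INVALID, 0x1000, true) == INVALID);	/* bio only: none */
	printf("%d\n", pick_paddr(0x2000, 0x1000, true) == 0x2000);	/* bio page wins */
	return 0;
}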
/*
@@ -987,10 +1066,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
const unsigned int stripe_nsectors =
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
+ const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
+ const unsigned int sector_nsteps = fs_info->sectorsize / step;
struct btrfs_raid_bio *rbio;
- /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
- ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * For bs <= ps cases, ps must be aligned to bs.
+ * For bs > ps cases, bs must be aligned to ps.
+ */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
+ IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
/*
* Our current stripe len should be fixed to 64k thus stripe_nsectors
* (at most 16) should be no larger than BITS_PER_LONG.
@@ -1009,19 +1094,22 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
GFP_NOFS);
- rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
- GFP_NOFS);
- rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
- GFP_NOFS);
+ rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
+ rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+ rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
- if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
- !rbio->finish_pointers || !rbio->error_bitmap) {
+ if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
+ !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
free_raid_bio_pointers(rbio);
kfree(rbio);
return ERR_PTR(-ENOMEM);
}
+ for (int i = 0; i < num_sectors * sector_nsteps; i++) {
+ rbio->stripe_paddrs[i] = INVALID_PADDR;
+ rbio->bio_paddrs[i] = INVALID_PADDR;
+ }
bio_list_init(&rbio->bio_list);
init_waitqueue_head(&rbio->io_wait);
@@ -1036,6 +1124,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->stripe_nsectors = stripe_nsectors;
+ rbio->sector_nsteps = sector_nsteps;
refcount_set(&rbio->refs, 1);
atomic_set(&rbio->stripes_pending, 0);
@@ -1080,8 +1169,8 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
* @faila and @failb will also be updated to the first and second stripe
* number of the errors.
*/
-static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
- int *faila, int *failb)
+static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
{
int stripe_nr;
int found_errors = 0;
@@ -1113,20 +1202,41 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
return found_errors;
}
+static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
+ unsigned int step)
+{
+ int added = 0;
+ int ret;
+
+ for (int i = 0; i < nr_steps; i++) {
+ ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
+ offset_in_page(paddrs[i]));
+ if (ret != step)
+ goto revert;
+ added += ret;
+ }
+ return added;
+revert:
+ /*
+ * We don't need to revert the bvecs: the bio will be submitted immediately, and
+ * as long as bi_size has been reduced the extra bvecs will never be accessed.
+ */
+ bio->bi_iter.bi_size -= added;
+ return 0;
+}
+
/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
- * Return <0 for error.
+ * Return <0 for error, in which case no bytes are added for @rbio.
*/
-static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct sector_ptr *sector,
- unsigned int stripe_nr,
- unsigned int sector_nr,
- enum req_op op)
+static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
+ phys_addr_t *paddrs, unsigned int stripe_nr,
+ unsigned int sector_nr, enum req_op op)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 step = min(sectorsize, PAGE_SIZE);
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
@@ -1142,7 +1252,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
rbio, sector_nr);
- ASSERT(sector->page);
+ ASSERT(paddrs != NULL);
stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + sector_nr * sectorsize;
@@ -1155,9 +1265,9 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
rbio->error_bitmap);
/* Check if we have reached tolerance early. */
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
- NULL, NULL);
- if (found_errors > rbio->bioc->max_errors)
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
+ NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
return 0;
}
@@ -1173,8 +1283,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, sector->page, sectorsize,
- sector->pgoff);
+ ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
if (ret == sectorsize)
return 0;
}
@@ -1187,31 +1296,27 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
bio->bi_private = rbio;
- __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+ ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
+ ASSERT(ret == sectorsize);
bio_list_add(bio_list, bio);
return 0;
}
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec bvec;
- struct bvec_iter iter;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
+ struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->full_stripe_logical;
- bio_for_each_segment(bvec, bio, iter) {
- u32 bvec_offset;
+ btrfs_bio_for_each_block(paddr, bio, &iter, step) {
+ unsigned int index = (offset >> step_bits);
- for (bvec_offset = 0; bvec_offset < bvec.bv_len;
- bvec_offset += sectorsize, offset += sectorsize) {
- int index = offset / sectorsize;
- struct sector_ptr *sector = &rbio->bio_sectors[index];
-
- sector->page = bvec.bv_page;
- sector->pgoff = bvec.bv_offset + bvec_offset;
- ASSERT(sector->pgoff < PAGE_SIZE);
- }
+ rbio->bio_paddrs[index] = paddr;
+ offset += step;
}
}
@@ -1289,49 +1394,64 @@ static void assert_rbio(struct btrfs_raid_bio *rbio)
ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
-/* Generate PQ for one vertical stripe. */
-static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+static inline void *kmap_local_paddr(phys_addr_t paddr)
+{
+ /* The sector pointer must have a page mapped to it. */
+ ASSERT(paddr != INVALID_PADDR);
+
+ return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
+}
+
+static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
+ unsigned int step_nr)
{
void **pointers = rbio->finish_pointers;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct sector_ptr *sector;
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
int stripe;
const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
/* First collect one sector from each data stripe */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
+ for (stripe = 0; stripe < rbio->nr_data; stripe++)
+ pointers[stripe] = kmap_local_paddr(
+ sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
/* Then add the parity stripe */
- sector = rbio_pstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+ pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
if (has_qstripe) {
/*
* RAID6, add the qstripe and call the library function
* to fill in our p/q
*/
- sector = rbio_qstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe++] = kmap_local_paddr(
+ rbio_qstripe_paddr(rbio, sector_nr, step_nr));
assert_rbio(rbio);
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
+ raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
} else {
/* raid5 */
- memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+ memcpy(pointers[rbio->nr_data], pointers[0], step);
+ run_xor(pointers + 1, rbio->nr_data - 1, step);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
+/* Generate PQ for one vertical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+{
+ const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
+
+ for (int i = 0; i < rbio->sector_nsteps; i++)
+ generate_pq_vertical_step(rbio, sectornr, i);
+
+ set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
+ rbio->stripe_uptodate_bitmap);
+ if (has_qstripe)
+ set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
+ rbio->stripe_uptodate_bitmap);
+}
+
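For the RAID5 branch above, run_xor() reduces to a byte-wise XOR of all data chunks into the parity chunk; the RAID6 branch additionally computes the Q syndrome via raid6_call.gen_syndrome(). A minimal user-space version of just the RAID5 parity fold, with toy buffer sizes:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* RAID5 parity: P = D0 ^ D1 ^ ... ^ Dn-1, computed one step-sized chunk at a time. */
static void gen_raid5_parity(uint8_t **data, int nr_data, uint8_t *parity, size_t step)
{
	memcpy(parity, data[0], step);
	for (int i = 1; i < nr_data; i++)
		for (size_t b = 0; b < step; b++)
			parity[b] ^= data[i][b];
}

int main(void)
{
	uint8_t d0[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t d1[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
	uint8_t *data[] = { d0, d1 };
	uint8_t p[8];

	gen_raid5_parity(data, 2, p, sizeof(p));
	/* Losing d1 can be undone by XORing the surviving stripe with P. */
	for (size_t b = 0; b < sizeof(p); b++)
		assert((p[b] ^ d0[b]) == d1[b]);
	return 0;
}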
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
struct bio_list *bio_list)
{
@@ -1358,7 +1478,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
@@ -1368,14 +1488,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
continue;
if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs == NULL)
continue;
} else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
+ ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
sectornr, REQ_OP_WRITE);
if (ret)
goto error;
@@ -1393,7 +1513,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
@@ -1418,14 +1538,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
continue;
if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs == NULL)
continue;
} else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, bio_list, sector,
+ ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
@@ -1473,22 +1593,17 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
}
/*
- * For subpage case, we can no longer set page Up-to-date directly for
- * stripe_pages[], thus we need to locate the sector.
+ * Return the sector number whose rbio->stripe_paddrs[] entry matches @paddr.
+ *
+ * Return -1 if not found.
*/
-static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
- struct page *page,
- unsigned int pgoff)
+static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
{
- int i;
-
- for (i = 0; i < rbio->nr_sectors; i++) {
- struct sector_ptr *sector = &rbio->stripe_sectors[i];
-
- if (sector->page == page && sector->pgoff == pgoff)
- return sector;
+ for (int i = 0; i < rbio->nr_sectors; i++) {
+ if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
+ return i;
}
- return NULL;
+ return -1;
}
/*
@@ -1498,38 +1613,34 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ const u32 step = min(sectorsize, PAGE_SIZE);
+ u32 offset = 0;
+ phys_addr_t paddr;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct sector_ptr *sector;
- int pgoff;
+ btrfs_bio_for_each_block_all(paddr, bio, step) {
+ /* Hitting the first step of a sector. */
+ if (IS_ALIGNED(offset, sectorsize)) {
+ int sector_nr = find_stripe_sector_nr(rbio, paddr);
- for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
- pgoff += sectorsize) {
- sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
- ASSERT(sector);
- if (sector)
- sector->uptodate = 1;
+ ASSERT(sector_nr >= 0);
+ if (sector_nr >= 0)
+ set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
}
+ offset += step;
}
}
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct bio_vec *bv = bio_first_bvec_all(bio);
+ phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
int i;
for (i = 0; i < rbio->nr_sectors; i++) {
- struct sector_ptr *sector;
-
- sector = &rbio->stripe_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
break;
- sector = &rbio->bio_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
break;
}
ASSERT(i < rbio->nr_sectors);
@@ -1562,9 +1673,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 nr_steps = rbio->sector_nsteps;
int total_sector_nr = get_bio_sector_nr(rbio, bio);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ u32 offset = 0;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
/* No data csum for the whole stripe, no need to verify. */
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1574,26 +1688,26 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
return;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- int bv_offset;
+ btrfs_bio_for_each_block_all(paddr, bio, step) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum;
- for (bv_offset = bvec->bv_offset;
- bv_offset < bvec->bv_offset + bvec->bv_len;
- bv_offset += fs_info->sectorsize, total_sector_nr++) {
- u8 csum_buf[BTRFS_CSUM_SIZE];
- u8 *expected_csum = rbio->csum_buf +
- total_sector_nr * fs_info->csum_size;
- int ret;
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
- /* No csum for this sector, skip to the next sector. */
- if (!test_bit(total_sector_nr, rbio->csum_bitmap))
- continue;
+ /* Not yet covering the full fs block, continue to the next step. */
+ if (!IS_ALIGNED(offset, fs_info->sectorsize))
+ continue;
- ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
- bv_offset, csum_buf, expected_csum);
- if (ret < 0)
- set_bit(total_sector_nr, rbio->error_bitmap);
- }
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+ continue;
+
+ expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
+ if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ total_sector_nr++;
}
}
@@ -1689,8 +1803,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
list_sort(NULL, &plug->rbio_list, plug_cmp);
while (!list_empty(&plug->rbio_list)) {
- cur = list_entry(plug->rbio_list.next,
- struct btrfs_raid_bio, plug_list);
+ cur = list_first_entry(&plug->rbio_list,
+ struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
@@ -1788,10 +1902,9 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
int stripe_nr, int sector_nr)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected;
- int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf)
return 0;
@@ -1804,57 +1917,32 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
* bio list if possible.
*/
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
- sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr);
}
- ASSERT(sector->page);
-
csum_expected = rbio->csum_buf +
(stripe_nr * rbio->stripe_nsectors + sector_nr) *
fs_info->csum_size;
- ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
- csum_buf, csum_expected);
- return ret;
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
+ if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0))
+ return -EIO;
+ return 0;
}
-/*
- * Recover a vertical stripe specified by @sector_nr.
- * @*pointers are the pre-allocated pointers by the caller, so we don't
- * need to allocate/free the pointers again and again.
- */
-static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
- void **pointers, void **unmap_array)
+static void recover_vertical_step(struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr,
+ unsigned int step_nr,
+ int faila, int failb,
+ void **pointers, void **unmap_array)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
- struct sector_ptr *sector;
- const u32 sectorsize = fs_info->sectorsize;
- int found_errors;
- int faila;
- int failb;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
int stripe_nr;
- int ret = 0;
- /*
- * Now we just use bitmap to mark the horizontal stripes in
- * which we have data when doing parity scrub.
- */
- if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sector_nr, &rbio->dbitmap))
- return 0;
-
- found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
- &failb);
- /*
- * No errors in the vertical stripe, skip it. Can happen for recovery
- * which only part of a stripe failed csum check.
- */
- if (!found_errors)
- return 0;
-
- if (found_errors > rbio->bioc->max_errors)
- return -EIO;
+ ASSERT(step_nr < rbio->sector_nsteps);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
/*
* Setup our array of pointers with sectors from each stripe
@@ -1863,18 +1951,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
* pointer order.
*/
for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ phys_addr_t paddr;
+
/*
* If we're rebuilding a read, we have to use pages from the
* bio list if possible.
*/
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
} else {
- sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+ paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
}
- ASSERT(sector->page);
- pointers[stripe_nr] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe_nr] = kmap_local_paddr(paddr);
unmap_array[stripe_nr] = pointers[stripe_nr];
}
@@ -1920,10 +2008,10 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
}
if (failb == rbio->real_stripes - 2) {
- raid6_datap_recov(rbio->real_stripes, sectorsize,
+ raid6_datap_recov(rbio->real_stripes, step,
faila, pointers);
} else {
- raid6_2data_recov(rbio->real_stripes, sectorsize,
+ raid6_2data_recov(rbio->real_stripes, step,
faila, failb, pointers);
}
} else {
@@ -1933,7 +2021,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
ASSERT(failb == -1);
pstripe:
/* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+ memcpy(pointers[faila], pointers[rbio->nr_data], step);
/* Rearrange the pointer array */
p = pointers[faila];
@@ -1943,40 +2031,66 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* Xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, sectorsize);
-
+ run_xor(pointers, rbio->nr_data - 1, step);
}
+cleanup:
+ for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+ kunmap_local(unmap_array[stripe_nr]);
+}
+
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
+{
+ int found_errors;
+ int faila;
+ int failb;
+ int ret = 0;
+
/*
- * No matter if this is a RMW or recovery, we should have all
- * failed sectors repaired in the vertical stripe, thus they are now
- * uptodate.
- * Especially if we determine to cache the rbio, we need to
- * have at least all data sectors uptodate.
- *
- * If possible, also check if the repaired sector matches its data
- * checksum.
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
*/
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sector_nr, &rbio->dbitmap))
+ return 0;
+
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the vertical stripe, skip it. Can happen for recovery
+ * which only part of a stripe failed csum check.
+ */
+ if (!found_errors)
+ return 0;
+
+ if (unlikely(found_errors > rbio->bioc->max_errors))
+ return -EIO;
+
+ for (int i = 0; i < rbio->sector_nsteps; i++)
+ recover_vertical_step(rbio, sector_nr, i, faila, failb,
+ pointers, unmap_array);
if (faila >= 0) {
ret = verify_one_sector(rbio, faila, sector_nr);
if (ret < 0)
- goto cleanup;
+ return ret;
- sector = rbio_stripe_sector(rbio, faila, sector_nr);
- sector->uptodate = 1;
+ set_bit(rbio_sector_index(rbio, faila, sector_nr),
+ rbio->stripe_uptodate_bitmap);
}
if (failb >= 0) {
ret = verify_one_sector(rbio, failb, sector_nr);
if (ret < 0)
- goto cleanup;
+ return ret;
- sector = rbio_stripe_sector(rbio, failb, sector_nr);
- sector->uptodate = 1;
+ set_bit(rbio_sector_index(rbio, failb, sector_nr),
+ rbio->stripe_uptodate_bitmap);
}
-
-cleanup:
- for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
- kunmap_local(unmap_array[stripe_nr]);
return ret;
}
@@ -2051,7 +2165,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio)
total_sector_nr++) {
int stripe = total_sector_nr / rbio->stripe_nsectors;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
/*
* Skip the range which has error. It can be a range which is
@@ -2068,8 +2182,8 @@ static void recover_rbio(struct btrfs_raid_bio *rbio)
continue;
}
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
sectornr, REQ_OP_READ);
if (ret < 0) {
bio_list_put(&bio_list);
@@ -2114,7 +2228,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n
int faila;
int failb;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
&faila, &failb);
/* This vertical stripe doesn't have errors. */
if (!found_errors)
@@ -2258,13 +2372,13 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
int stripe = total_sector_nr / rbio->stripe_nsectors;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
+ sectornr, REQ_OP_READ);
if (ret) {
bio_list_put(&bio_list);
return ret;
@@ -2282,9 +2396,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
static void raid_wait_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- if (err)
+ if (bio->bi_status)
rbio_update_error_bitmap(rbio, bio);
bio_put(bio);
if (atomic_dec_and_test(&rbio->stripes_pending))
@@ -2319,14 +2432,15 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
int i;
for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
- struct sector_ptr *sector = &rbio->stripe_sectors[i];
+ phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps];
/*
* We have a sector which doesn't have page nor uptodate,
* thus this rbio can not be cached one, as cached one must
* have all its data sectors present and uptodate.
*/
- if (!sector->page || !sector->uptodate)
+ if (paddr == INVALID_PADDR ||
+ !test_bit(i, rbio->stripe_uptodate_bitmap))
return true;
}
return false;
@@ -2407,8 +2521,8 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2478,46 +2592,121 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
return rbio;
}
+static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
+ int sector_nr)
+{
+ const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
+ const u32 base = sector_nr * rbio->sector_nsteps;
+
+ for (int i = base; i < base + rbio->sector_nsteps; i++) {
+ const unsigned int page_index = (i * step) >> PAGE_SHIFT;
+ struct page *page;
+
+ if (rbio->stripe_pages[page_index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[page_index] = page;
+ }
+ return 0;
+}
+
/*
* We just scrub the parity that we have correct data on the same horizontal,
* so we needn't allocate all pages for all the stripes.
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int total_sector_nr;
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct page *page;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
- int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
+ int ret;
if (!test_bit(sectornr, &rbio->dbitmap))
continue;
- if (rbio->stripe_pages[index])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
+ ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
+ if (ret < 0)
+ return ret;
}
index_stripe_sectors(rbio);
return 0;
}
+/* Return true if the content of the step matches the calculated one. */
+static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
+ void *pointers[], unsigned int sector_nr,
+ unsigned int step_nr)
+{
+ const unsigned int nr_data = rbio->nr_data;
+ const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+ void *parity;
+ bool ret = false;
+
+ ASSERT(step_nr < rbio->sector_nsteps);
+
+ /* First collect one page from each data stripe. */
+ for (int stripe = 0; stripe < nr_data; stripe++)
+ pointers[stripe] = kmap_local_paddr(
+ sector_paddr_in_rbio(rbio, stripe, sector_nr,
+ step_nr, 0));
+
+ if (has_qstripe) {
+ assert_rbio(rbio);
+ /* RAID6, call the library function to fill in our P/Q. */
+ raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
+ } else {
+ /* RAID5. */
+ memcpy(pointers[nr_data], pointers[0], step);
+ run_xor(pointers + 1, nr_data - 1, step);
+ }
+
+ /* Check scrubbing parity and repair it. */
+ parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
+ if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
+ memcpy(parity, pointers[rbio->scrubp], step);
+ else
+ ret = true;
+ kunmap_local(parity);
+
+ for (int stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+ return ret;
+}
+
+/*
+ * The @pointers array should have the P/Q parity already mapped.
+ */
+static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
+ void *pointers[], unsigned int sector_nr)
+{
+ bool found_error = false;
+
+ for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
+ bool match;
+
+ match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
+ if (!match)
+ found_error = true;
+ }
+ if (!found_error)
+ bitmap_clear(&rbio->dbitmap, sector_nr, 1);
+}
+
static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
- const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
- int stripe;
int sectornr;
bool has_qstripe;
- struct sector_ptr p_sector = { 0 };
- struct sector_ptr q_sector = { 0 };
+ struct page *page;
+ phys_addr_t p_paddr = INVALID_PADDR;
+ phys_addr_t q_paddr = INVALID_PADDR;
struct bio_list bio_list;
int is_replace = 0;
int ret;
@@ -2547,73 +2736,39 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- p_sector.page = alloc_page(GFP_NOFS);
- if (!p_sector.page)
+ page = alloc_page(GFP_NOFS);
+ if (!page)
return -ENOMEM;
- p_sector.pgoff = 0;
- p_sector.uptodate = 1;
+ p_paddr = page_to_phys(page);
+ page = NULL;
+ pointers[nr_data] = kmap_local_paddr(p_paddr);
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_sector.page = alloc_page(GFP_NOFS);
- if (!q_sector.page) {
- __free_page(p_sector.page);
- p_sector.page = NULL;
+ page = alloc_page(GFP_NOFS);
+ if (!page) {
+ __free_page(phys_to_page(p_paddr));
+ p_paddr = INVALID_PADDR;
return -ENOMEM;
}
- q_sector.pgoff = 0;
- q_sector.uptodate = 1;
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
+ q_paddr = page_to_phys(page);
+ page = NULL;
+ pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
}
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_sector.page);
-
- for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
- void *parity;
-
- /* first collect one page from each data stripe */
- for (stripe = 0; stripe < nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
-
- if (has_qstripe) {
- assert_rbio(rbio);
- /* RAID6, call the library function to fill in our P/Q */
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, nr_data - 1, sectorsize);
- }
- /* Check scrubbing parity and repair it */
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- parity = kmap_local_page(sector->page) + sector->pgoff;
- if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
- memcpy(parity, pointers[rbio->scrubp], sectorsize);
- else
- /* Parity is right, needn't writeback */
- bitmap_clear(&rbio->dbitmap, sectornr, 1);
- kunmap_local(parity);
-
- for (stripe = nr_data - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
- }
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
+ verify_one_parity_sector(rbio, pointers, sectornr);
kunmap_local(pointers[nr_data]);
- __free_page(p_sector.page);
- p_sector.page = NULL;
- if (q_sector.page) {
- kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_sector.page);
- q_sector.page = NULL;
+ __free_page(phys_to_page(p_paddr));
+ p_paddr = INVALID_PADDR;
+ if (q_paddr != INVALID_PADDR) {
+ kunmap_local(pointers[rbio->real_stripes - 1]);
+ __free_page(phys_to_page(q_paddr));
+ q_paddr = INVALID_PADDR;
}
/*
@@ -2622,10 +2777,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
* everything else.
*/
for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp,
sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
@@ -2640,11 +2795,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- rbio->real_stripes,
+ paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
@@ -2692,9 +2846,9 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
int failb;
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
&faila, &failb);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
goto out;
}
@@ -2718,7 +2872,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* data, so the capability of the repair is declined. (In the
* case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1) {
+ if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
ret = -EIO;
goto out;
}
@@ -2735,7 +2889,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* scrubbing parity, luckily, use the other one to repair the
* data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp) {
+ if (unlikely(failp != rbio->scrubp)) {
ret = -EIO;
goto out;
}
@@ -2761,7 +2915,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
total_sector_nr++) {
int sectornr = total_sector_nr % rbio->stripe_nsectors;
int stripe = total_sector_nr / rbio->stripe_nsectors;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
/* No data in the vertical stripe, no need to read. */
if (!test_bit(sectornr, &rbio->dbitmap))
@@ -2769,22 +2923,23 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
/*
* We want to find all the sectors missing from the rbio and
- * read them from the disk. If sector_in_rbio() finds a sector
+ * read them from the disk. If sector_paddr_in_rbio() finds a sector
* in the bio list we don't need to read it off the stripe.
*/
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs == NULL)
continue;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
/*
* The bio cache may have handed us an uptodate sector. If so,
* use it.
*/
- if (sector->uptodate)
+ if (test_bit(rbio_sector_index(rbio, stripe, sectornr),
+ rbio->stripe_uptodate_bitmap))
continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
sectornr, REQ_OP_READ);
if (ret) {
bio_list_put(&bio_list);
@@ -2825,8 +2980,8 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2850,17 +3005,17 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
* This is for scrub call sites where we already have correct data contents.
* This allows us to avoid reading data stripes again.
*
- * Unfortunately here we have to do page copy, other than reusing the pages.
+ * Unfortunately here we have to copy the folio contents instead of reusing the pages.
* This is due to the fact rbio has its own page management for its cache.
*/
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical)
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical)
{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u64 offset_in_full_stripe = data_logical -
rbio->bioc->full_stripe_logical;
- const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ unsigned int findex = 0;
+ unsigned int foffset = 0;
int ret;
/*
@@ -2879,14 +3034,24 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
- for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
- struct page *dst = rbio->stripe_pages[page_nr + page_index];
- struct page *src = data_pages[page_nr];
-
- memcpy_page(dst, 0, src, 0, PAGE_SIZE);
- for (int sector_nr = sectors_per_page * page_index;
- sector_nr < sectors_per_page * (page_index + 1);
- sector_nr++)
- rbio->stripe_sectors[sector_nr].uptodate = true;
+ for (unsigned int cur_off = offset_in_full_stripe;
+ cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
+ cur_off += PAGE_SIZE) {
+ const unsigned int pindex = cur_off >> PAGE_SHIFT;
+ void *kaddr;
+
+ kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
+ memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
+ kunmap_local(kaddr);
+
+ foffset += PAGE_SIZE;
+ ASSERT(foffset <= folio_size(data_folios[findex]));
+ if (foffset == folio_size(data_folios[findex])) {
+ findex++;
+ foffset = 0;
+ }
}
+ bitmap_set(rbio->stripe_uptodate_bitmap,
+ offset_in_full_stripe >> fs_info->sectorsize_bits,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
}
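The copy loop above walks the destination stripe pages one PAGE_SIZE chunk at a time and advances to the next source folio whenever the in-folio offset reaches that folio's size. Stripped of the kmap and btrfs types, the iteration is just the following (folio sizes are made up for illustration):

#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
	/* Two source folios: a 16K one followed by a 48K one, covering a 64K stripe. */
	const unsigned int folio_sizes[] = { 16384, 49152 };
	unsigned int findex = 0, foffset = 0;

	for (unsigned int cur = 0; cur < 65536; cur += PAGE_SIZE) {
		printf("dst page %2u <- folio %u offset %5u\n",
		       cur / PAGE_SIZE, findex, foffset);
		foffset += PAGE_SIZE;
		if (foffset == folio_sizes[findex]) {	/* exhausted, move on */
			findex++;
			foffset = 0;
		}
	}
	return 0;
}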
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0d7b4c2fb6ae..1f463ecf7e41 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -16,7 +16,6 @@
#include "volumes.h"
struct page;
-struct sector_ptr;
struct btrfs_fs_info;
enum btrfs_rbio_ops {
@@ -25,6 +24,84 @@ enum btrfs_rbio_ops {
BTRFS_RBIO_PARITY_SCRUB,
};
+/*
+ * Overview of btrfs_raid_bio.
+ *
+ * One btrfs_raid_bio represents a full stripe of RAID56, including both data
+ * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
+ *
+ * One btrfs_raid_bio can have one or more bios from higher layer, covering
+ * part or all of the data stripes.
+ *
+ * [PAGES FROM HIGHER LAYER BIOS]
+ * Higher layer bios are in the btrfs_raid_bio::bio_list.
+ *
+ * Pages from the bio_list are represented like the following:
+ *
+ * bio_list: |<- Bio 1 ->| |<- Bio 2 ->| ...
+ * bio_paddrs: [0] [1] [2] [3] [4] [5] ...
+ *
+ * If there is a bio covering a sector (one btrfs fs block), the corresponding
+ * entry in btrfs_raid_bio::bio_paddrs[] holds the physical address (including
+ * the offset inside the page) of that sector's data in the bio.
+ *
+ * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
+ * be INVALID_PADDR.
+ *
+ * The length of each entry in bio_paddrs[] is one step (i.e. min(sectorsize, PAGE_SIZE)).
+ *
+ * [PAGES FOR INTERNAL USAGES]
+ * Pages not covered by any bio or belonging to P/Q stripes are stored in
+ * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
+ *
+ * stripe_pages: |<- Page 0 ->|<- Page 1 ->| ...
+ * stripe_paddrs: [0] [1] [2] [3] [4] ...
+ *
+ * stripe_pages[] array stores all the pages covering the full stripe, including
+ * data and P/Q pages.
+ * stripe_pages[0] is the first page of the first data stripe.
+ * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
+ * data stripe.
+ *
+ * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
+ * (the bio covers all data stripes) there is no need to allocate pages for
+ * the data stripes (their contents can be taken from bio_paddrs[] instead).
+ *
+ * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
+ * stripe_paddrs[i] will be INVALID_PADDR.
+ *
+ * The length of each entry in stripe_paddrs[] is a step.
+ *
+ * [LOCATING A SECTOR]
+ * To locate a sector for IO, we need the following info:
+ *
+ * - stripe_nr
+ * Starts from 0 (representing the first data stripe), ends at
+ * @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
+ *
+ * - sector_nr
+ * Starts from 0 (representing the first sector of the stripe), ends
+ * at BTRFS_STRIPE_LEN / sectorsize - 1.
+ *
+ * - step_nr
+ * A step is min(sectorsize, PAGE_SIZE).
+ *
+ * Starts from 0 (representing the first step of the sector), ends
+ * at @sector_nsteps - 1.
+ *
+ * Most call sites do not need to care about this parameter; it only
+ * matters for bs > ps support and for vertical stripe related work
+ * (e.g. RMW/recover).
+ *
+ * - from which array
+ * Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the
+ * bio_paddrs[] (aka, from the higher layer bios).
+ *
+ * For IO, a physical address is returned, so that we can extract the page and
+ * the offset inside the page for IO.
+ * A special value INVALID_PADDR represents when the physical address is invalid,
+ * normally meaning there is no page allocated for the specified sector.
+ */
struct btrfs_raid_bio {
struct btrfs_io_context *bioc;
@@ -82,6 +159,14 @@ struct btrfs_raid_bio {
/* How many sectors there are for each stripe */
u8 stripe_nsectors;
+ /*
+ * How many steps there are for one sector.
+ *
+ * For bs > ps cases, it's sectorsize / PAGE_SIZE.
+ * For bs <= ps cases, it's always 1.
+ */
+ u8 sector_nsteps;
+
/* Stripe number that we're scrubbing */
u8 scrubp;
@@ -116,13 +201,13 @@ struct btrfs_raid_bio {
struct page **stripe_pages;
/* Pointers to the sectors in the bio_list, for faster lookup */
- struct sector_ptr *bio_sectors;
+ phys_addr_t *bio_paddrs;
- /*
- * For subpage support, we need to map each sector to above
- * stripe_pages.
- */
- struct sector_ptr *stripe_sectors;
+ /* Pointers to the sectors in the stripe_pages[]. */
+ phys_addr_t *stripe_paddrs;
+
+ /* Each set bit means the corresponding sector in stripe_paddrs[] is uptodate. */
+ unsigned long *stripe_uptodate_bitmap;
/* Allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
@@ -131,10 +216,6 @@ struct btrfs_raid_bio {
* The bitmap recording where IO errors happened.
* Each bit is corresponding to one sector in either bio_sectors[] or
* stripe_sectors[] array.
- *
- * The reason we don't use another bit in sector_ptr is, we have two
- * arrays of sectors, and a lot of IO can use sectors in both arrays.
- * Thus making it much harder to iterate.
*/
unsigned long *error_bitmap;
@@ -201,8 +282,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical);
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
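The [LOCATING A SECTOR] scheme above reduces to plain array arithmetic. The following is a minimal, self-contained sketch of that arithmetic, not the kernel implementation: the structure, helper names and the INVALID_PADDR value are placeholders, and only the 64K stripe length and the indexing order (stripe, then sector, then step) are taken from the comment.

	/*
	 * Sketch only: map the (stripe_nr, sector_nr, step_nr) triple described
	 * above to an index into a paddr array.
	 */
	#include <stdbool.h>
	#include <stdint.h>

	#define DEMO_STRIPE_LEN		(64 * 1024)	/* BTRFS_STRIPE_LEN */
	#define DEMO_INVALID_PADDR	((uint64_t)-1)	/* "no page here" */

	struct demo_rbio_layout {
		uint32_t sectorsize;		/* fs block size */
		uint32_t step;			/* min(sectorsize, PAGE_SIZE) */
		uint32_t stripe_nsectors;	/* DEMO_STRIPE_LEN / sectorsize */
		uint32_t sector_nsteps;		/* sectorsize / step, 1 when bs <= ps */
		const uint64_t *stripe_paddrs;	/* internal pages (data + P/Q) */
		const uint64_t *bio_paddrs;	/* pages from the higher layer bios */
	};

	/* Index of one step inside the full stripe. */
	static inline uint32_t demo_step_index(const struct demo_rbio_layout *l,
					       uint32_t stripe_nr, uint32_t sector_nr,
					       uint32_t step_nr)
	{
		return (stripe_nr * l->stripe_nsectors + sector_nr) * l->sector_nsteps +
		       step_nr;
	}

	/* Physical address to do IO on, or DEMO_INVALID_PADDR if no page exists. */
	static inline uint64_t demo_sector_paddr(const struct demo_rbio_layout *l,
						 bool from_bio, uint32_t stripe_nr,
						 uint32_t sector_nr, uint32_t step_nr)
	{
		const uint64_t *array = from_bio ? l->bio_paddrs : l->stripe_paddrs;

		return array[demo_step_index(l, stripe_nr, sector_nr, step_nr)];
	}

Which array gets consulted is the "from which array" decision from the comment; the sketch assumes both arrays are indexed with the same triple.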
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
deleted file mode 100644
index 1c2d7cb1fe6f..000000000000
--- a/fs/btrfs/rcu-string.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- */
-
-#ifndef BTRFS_RCU_STRING_H
-#define BTRFS_RCU_STRING_H
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-#include <linux/printk.h>
-
-struct rcu_string {
- struct rcu_head rcu;
- char str[];
-};
-
-static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
-{
- size_t len = strlen(src) + 1;
- struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
- (len * sizeof(char)), mask);
- if (!ret)
- return ret;
- /* Warn if the source got unexpectedly truncated. */
- if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
- kfree(ret);
- return NULL;
- }
- return ret;
-}
-
-static inline void rcu_string_free(struct rcu_string *str)
-{
- if (str)
- kfree_rcu(str, rcu);
-}
-
-#define printk_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define printk_ratelimited_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk_ratelimited(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define rcu_str_deref(rcu_str) ({ \
- struct rcu_string *__str = rcu_dereference(rcu_str); \
- __str->str; \
-})
-
-#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 9522a8b79d22..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -75,69 +75,70 @@ struct block_entry {
struct list_head actions;
};
+static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct block_entry *entry = rb_entry(node, struct block_entry, node);
+
+ if (entry->bytenr < *bytenr)
+ return 1;
+ else if (entry->bytenr > *bytenr)
+ return -1;
+
+ return 0;
+}
+
+static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct block_entry *new_entry = rb_entry(new, struct block_entry, node);
+
+ return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing);
+}
+
static struct block_entry *insert_block_entry(struct rb_root *root,
struct block_entry *be)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct block_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct block_entry, node);
- if (entry->bytenr > be->bytenr)
- p = &(*p)->rb_left;
- else if (entry->bytenr < be->bytenr)
- p = &(*p)->rb_right;
- else
- return entry;
- }
+ struct rb_node *node;
- rb_link_node(&be->node, parent_node, p);
- rb_insert_color(&be->node, root);
- return NULL;
+ node = rb_find_add(&be->node, root, block_entry_bytenr_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
}
static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
{
- struct rb_node *n;
- struct block_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct block_entry, node);
- if (entry->bytenr < bytenr)
- n = n->rb_right;
- else if (entry->bytenr > bytenr)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
+}
+
+static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *objectid = key;
+ const struct root_entry *entry = rb_entry(node, struct root_entry, node);
+
+ if (entry->root_objectid < *objectid)
+ return 1;
+ else if (entry->root_objectid > *objectid)
+ return -1;
+
+ return 0;
+}
+
+static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct root_entry *new_entry = rb_entry(new, struct root_entry, node);
+
+ return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing);
}
static struct root_entry *insert_root_entry(struct rb_root *root,
struct root_entry *re)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct root_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct root_entry, node);
- if (entry->root_objectid > re->root_objectid)
- p = &(*p)->rb_left;
- else if (entry->root_objectid < re->root_objectid)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&re->node, parent_node, p);
- rb_insert_color(&re->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
@@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
return 0;
}
+static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node);
+ struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node);
+
+ return comp_refs(new_entry, existing_entry);
+}
+
static struct ref_entry *insert_ref_entry(struct rb_root *root,
struct ref_entry *ref)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct ref_entry *entry;
- int cmp;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct ref_entry, node);
- cmp = comp_refs(entry, ref);
- if (cmp > 0)
- p = &(*p)->rb_left;
- else if (cmp < 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&ref->node, parent_node, p);
- rb_insert_color(&ref->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&ref->node, root, ref_entry_cmp);
+ return rb_entry_safe(node, struct ref_entry, node);
}
static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
{
- struct rb_node *n;
- struct root_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct root_entry, node);
- if (entry->root_objectid < objectid)
- n = n->rb_right;
- else if (entry->root_objectid > objectid)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
#ifdef CONFIG_STACKTRACE
@@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
* our sanity checks pass as they are no longer needed.
*/
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
struct ref_entry *ref = NULL, *exist;
struct ref_action *ra = NULL;
@@ -857,6 +839,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ rb_erase(&ref->node, &be->refs);
kfree(ref);
kfree(ra);
goto out_unlock;
@@ -988,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
@@ -997,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* The extent tree can be missing if it is damaged and the fs was mounted with IGNOREBADROOTS. */
+ if (!extent_root) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
@@ -1031,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
- btrfs_free_path(path);
return ret;
}
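The open-coded rbtree walks removed above all follow the same shape, which is what rb_find() and rb_find_add() from <linux/rbtree.h> encapsulate. Below is a standalone sketch of the pattern; struct demo_entry and the function names are made up, the comparator conventions are the ones the rbtree helpers expect.

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct demo_entry {
		struct rb_node node;
		u64 key;
	};

	/* cmp(key, node): <0 descend left, >0 descend right, 0 means a match. */
	static int demo_key_cmp(const void *key, const struct rb_node *node)
	{
		const u64 *k = key;
		const struct demo_entry *entry = rb_entry(node, struct demo_entry, node);

		if (entry->key < *k)
			return 1;
		else if (entry->key > *k)
			return -1;

		return 0;
	}

	static int demo_cmp(struct rb_node *new, const struct rb_node *existing)
	{
		const struct demo_entry *new_entry = rb_entry(new, struct demo_entry, node);

		return demo_key_cmp(&new_entry->key, existing);
	}

	/* Returns NULL on success, or the already existing entry on collision. */
	static struct demo_entry *demo_insert(struct rb_root *root, struct demo_entry *de)
	{
		struct rb_node *node = rb_find_add(&de->node, root, demo_cmp);

		return rb_entry_safe(node, struct demo_entry, node);
	}

	static struct demo_entry *demo_lookup(struct rb_root *root, u64 key)
	{
		struct rb_node *node = rb_find(&key, root, demo_key_cmp);

		return rb_entry_safe(node, struct demo_entry, node);
	}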
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 3511e1a5c96b..1ce544d53cc5 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -12,14 +12,14 @@
struct btrfs_fs_info;
struct btrfs_ref;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
#include <linux/spinlock.h>
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref);
+ const struct btrfs_ref *generic_ref);
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
@@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
}
static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
return 0;
}
@@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
{
}
-#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+#endif /* CONFIG_BTRFS_DEBUG */
#endif
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f0824c948cb7..b5fe95baf92e 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/blkdev.h>
+#include <linux/fscrypt.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
@@ -23,7 +24,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
u64 endoff,
const u64 destoff,
const u64 olen,
- int no_time_update)
+ bool no_time_update)
{
int ret;
@@ -43,14 +44,12 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
}
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
+ return btrfs_end_transaction(trans);
}
static int copy_inline_to_page(struct btrfs_inode *inode,
@@ -87,7 +86,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
btrfs_alloc_write_mask(mapping));
if (IS_ERR(folio)) {
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out_unlock;
}
@@ -95,9 +94,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
- clear_extent_bit(&inode->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- NULL);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -165,7 +163,7 @@ out:
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
@@ -175,8 +173,8 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
- struct btrfs_root *root = BTRFS_I(dst)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
@@ -185,12 +183,12 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
- key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -205,7 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst,
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
@@ -214,7 +212,7 @@ static int clone_copy_inline_extent(struct inode *dst,
ASSERT(key.offset > 0);
goto copy_to_page;
}
- } else if (i_size_read(dst) <= datal) {
+ } else if (i_size_read(&inode->vfs_inode) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -236,7 +234,7 @@ copy_inline_extent:
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
- if (i_size_read(dst) > datal) {
+ if (i_size_read(&inode->vfs_inode) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger then the one we want to
@@ -270,20 +268,26 @@ copy_inline_extent:
drop_args.start = drop_start;
drop_args.end = aligned_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
- if (ret)
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- btrfs_set_inode_full_sync(BTRFS_I(dst));
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+ btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(inode);
+ ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
/*
@@ -298,10 +302,8 @@ out:
trans = NULL;
}
}
- if (ret && trans) {
- btrfs_abort_transaction(trans, ret);
+ if (ret && trans)
btrfs_end_transaction(trans);
- }
if (!ret)
*trans_out = trans;
@@ -318,7 +320,7 @@ copy_to_page:
*/
btrfs_release_path(path);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
@@ -336,13 +338,13 @@ copy_to_page:
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
+ const u64 destoff, bool no_time_update)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
- char *buf = NULL;
+ char AUTO_KVFREE(buf);
struct btrfs_key key;
u32 nritems;
int slot;
@@ -357,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
+ if (!path)
return ret;
- }
path->reada = READA_FORWARD;
/* Clone data */
@@ -526,7 +526,7 @@ process_slot:
goto out;
}
- ret = clone_copy_inline_extent(inode, path, &new_key,
+ ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
@@ -610,33 +610,31 @@ process_slot:
}
out:
- btrfs_free_path(path);
- kvfree(buf);
clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
return ret;
}
-static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
- down_write(&BTRFS_I(inode1)->i_mmap_lock);
- down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+ down_write(&inode1->i_mmap_lock);
+ down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
-static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- up_write(&BTRFS_I(inode1)->i_mmap_lock);
- up_write(&BTRFS_I(inode2)->i_mmap_lock);
+ up_write(&inode1->i_mmap_lock);
+ up_write(&inode2->i_mmap_lock);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+ struct btrfs_inode *dst, u64 dst_loff)
{
const u64 end = dst_loff + len - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ struct btrfs_fs_info *fs_info = src->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
@@ -646,9 +644,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
+ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+ ALIGN(len, bs), dst_loff, 1);
+ btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -678,8 +677,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+ BTRFS_I(dst), dst_loff);
if (ret)
goto out;
@@ -688,7 +687,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
}
if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+ BTRFS_I(dst), dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
@@ -747,9 +747,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* mode.
*/
end = destoff + len - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
@@ -775,24 +775,28 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize;
+ struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+ struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+ u64 bs = inode_out->root->fs_info->sectorsize;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+ struct btrfs_root *root_out = inode_out->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
- ASSERT(inode_in->i_sb == inode_out->i_sb);
+ ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
+ /* Can only reflink encrypted files if both files are encrypted. */
+ if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
+ return -EINVAL;
+
/* Don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+ (inode_out->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
@@ -811,7 +815,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
@@ -832,16 +836,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
- ret = filemap_flush(inode_in->i_mapping);
+ ret = filemap_flush(inode_in->vfs_inode.i_mapping);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
if (ret < 0)
return ret;
@@ -863,18 +865,21 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
+ struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+ struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
bool same_inode = dst_inode == src_inode;
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))))
+ return -EIO;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
- lock_two_nondirectories(src_inode, dst_inode);
+ lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}
@@ -884,16 +889,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+ &dst_inode->vfs_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
- unlock_two_nondirectories(src_inode, dst_inode);
+ unlock_two_nondirectories(&src_inode->vfs_inode,
+ &dst_inode->vfs_inode);
}
/*
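Several of the conversions in this patchset (BTRFS_PATH_AUTO_FREE() above in btrfs_clone(), AUTO_KVFREE() for the clone buffer, AUTO_KFREE() in relocation.c below) lean on the kernel's scope-based cleanup machinery from <linux/cleanup.h>. A minimal sketch of how such a helper can be built follows; DEMO_AUTO_KFREE() is a made-up name and the series' actual macro definitions may differ.

	#include <linux/cleanup.h>
	#include <linux/errno.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	/* Declare a pointer that is kfree()d automatically when it leaves scope. */
	#define DEMO_AUTO_KFREE(type, name)	type *name __free(kfree) = NULL

	static int demo_use_buffer(size_t len)
	{
		DEMO_AUTO_KFREE(char, buf);

		buf = kmalloc(len, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* Every return path from here on frees buf without an explicit kfree(). */
		memset(buf, 0, len);
		return 0;
	}

The __free(kfree) class is provided via <linux/slab.h>; BTRFS_PATH_AUTO_FREE() presumably does the same with btrfs_free_path() as the cleanup callback.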
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index bf267bdfa8f8..5bfefc3e9c06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -90,10 +90,15 @@
* map address of tree root to tree
*/
struct mapping_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simle_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
void *data;
};
@@ -106,10 +111,15 @@ struct mapping_tree {
* present a tree block to process
*/
struct tree_block {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
u64 owner;
struct btrfs_key key;
u8 level;
@@ -178,8 +188,9 @@ static void mark_block_processed(struct reloc_control *rc,
in_range(node->bytenr, rc->block_group->start,
rc->block_group->length)) {
blocksize = rc->extent_root->fs_info->nodesize;
- set_extent_bit(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY,
+ NULL);
}
node->processed = 1;
}
@@ -195,8 +206,8 @@ static struct btrfs_backref_node *walk_up_backref(
int idx = *index;
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
@@ -222,8 +233,8 @@ static struct btrfs_backref_node *walk_down_backref(
idx--;
continue;
}
- edge = list_entry(edge->list[LOWER].next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
@@ -342,19 +353,13 @@ static bool handle_useless_nodes(struct reloc_control *rc,
if (cur == node)
ret = true;
- /* The node is the lowest node */
- if (cur->lowest) {
- list_del_init(&cur->lower);
- cur->lowest = 0;
- }
-
/* Cleanup the lower edges */
while (!list_empty(&cur->lower)) {
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *lower;
- edge = list_entry(cur->lower.next,
- struct btrfs_backref_edge, list[UPPER]);
+ edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
+ list[UPPER]);
list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
lower = edge->node[LOWER];
@@ -373,7 +378,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
* cache to avoid unnecessary backref lookup.
*/
if (cur->level > 0) {
- list_add(&cur->list, &cache->detached);
cur->detached = 1;
} else {
rb_erase(&cur->rb_node, &cache->rb_root);
@@ -426,7 +430,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
goto out;
}
- node->lowest = 1;
cur = node;
/* Breadth-first search to build backref cache */
@@ -470,92 +473,6 @@ out:
}
/*
- * helper to add backref node for the newly created snapshot.
- * the backref node is created by cloning backref node that
- * corresponds to root of source tree
- */
-static int clone_backref_node(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- const struct btrfs_root *src,
- struct btrfs_root *dest)
-{
- struct btrfs_root *reloc_root = src->reloc_root;
- struct btrfs_backref_cache *cache = &rc->backref_cache;
- struct btrfs_backref_node *node = NULL;
- struct btrfs_backref_node *new_node;
- struct btrfs_backref_edge *edge;
- struct btrfs_backref_edge *new_edge;
- struct rb_node *rb_node;
-
- rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
- if (rb_node) {
- node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
- if (node->detached)
- node = NULL;
- else
- BUG_ON(node->new_bytenr != reloc_root->node->start);
- }
-
- if (!node) {
- rb_node = rb_simple_search(&cache->rb_root,
- reloc_root->commit_root->start);
- if (rb_node) {
- node = rb_entry(rb_node, struct btrfs_backref_node,
- rb_node);
- BUG_ON(node->detached);
- }
- }
-
- if (!node)
- return 0;
-
- new_node = btrfs_backref_alloc_node(cache, dest->node->start,
- node->level);
- if (!new_node)
- return -ENOMEM;
-
- new_node->lowest = node->lowest;
- new_node->checked = 1;
- new_node->root = btrfs_grab_root(dest);
- ASSERT(new_node->root);
-
- if (!node->lowest) {
- list_for_each_entry(edge, &node->lower, list[UPPER]) {
- new_edge = btrfs_backref_alloc_edge(cache);
- if (!new_edge)
- goto fail;
-
- btrfs_backref_link_edge(new_edge, edge->node[LOWER],
- new_node, LINK_UPPER);
- }
- } else {
- list_add_tail(&new_node->lower, &cache->leaves);
- }
-
- rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
- &new_node->rb_node);
- if (rb_node)
- btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
-
- if (!new_node->lowest) {
- list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
- list_add_tail(&new_edge->list[LOWER],
- &new_edge->node[LOWER]->upper);
- }
- }
- return 0;
-fail:
- while (!list_empty(&new_node->lower)) {
- new_edge = list_entry(new_node->lower.next,
- struct btrfs_backref_edge, list[UPPER]);
- list_del(&new_edge->list[UPPER]);
- btrfs_backref_free_edge(cache, new_edge);
- }
- btrfs_backref_free_node(cache, new_node);
- return -ENOMEM;
-}
-
-/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __add_reloc_root(struct btrfs_root *root)
@@ -573,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root)
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_err(fs_info,
@@ -595,7 +511,7 @@ static void __del_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
- struct mapping_node *node = NULL;
+ struct mapping_node AUTO_KFREE(node);
struct reloc_control *rc = fs_info->reloc_ctl;
bool put_ref = false;
@@ -628,7 +544,6 @@ static void __del_reloc_root(struct btrfs_root *root)
spin_unlock(&fs_info->trans_lock);
if (put_ref)
btrfs_put_root(root);
- kfree(node);
}
/*
@@ -657,8 +572,7 @@ static int __update_reloc_root(struct btrfs_root *root)
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
@@ -671,10 +585,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
- struct btrfs_root_item *root_item;
+ struct btrfs_root_item AUTO_KFREE(root_item);
struct btrfs_key root_key;
int ret = 0;
- bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
if (!root_item)
@@ -687,11 +600,29 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT,
+ objectid, BTRFS_KEY_FMT_VALUE(&cpu_key));
+ return ERR_PTR(-EUCLEAN);
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
- goto fail;
+ return ERR_PTR(ret);
/*
* Set the last_snapshot field to the generation of the commit
@@ -714,14 +645,13 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
- goto fail;
+ return ERR_PTR(ret);
}
/*
* We have changed references at this point, we must abort the
- * transaction if anything fails.
+ * transaction if anything fails (i.e. 'goto abort').
*/
- must_abort = true;
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
@@ -741,9 +671,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
if (ret)
- goto fail;
-
- kfree(root_item);
+ goto abort;
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
if (IS_ERR(reloc_root)) {
@@ -753,11 +681,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
-fail:
- kfree(root_item);
+
abort:
- if (must_abort)
- btrfs_abort_transaction(trans, ret);
+ btrfs_abort_transaction(trans, ret);
return ERR_PTR(ret);
}
@@ -887,7 +813,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
int ret;
@@ -900,11 +826,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -915,16 +839,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi));
- if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
- ret = -EINVAL;
- goto out;
- }
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
+ return -EINVAL;
*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -950,7 +869,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
u32 i;
int ret = 0;
int first = 1;
- int dirty = 0;
if (rc->stage != UPDATE_DATA_PTRS)
return 0;
@@ -1005,16 +923,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
/* Take mmap lock to serialize with reflinks. */
if (!down_read_trylock(&inode->i_mmap_lock))
continue;
- ret = try_lock_extent(&inode->io_tree, key.offset,
- end, &cached_state);
+ ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
if (!ret) {
up_read(&inode->i_mmap_lock);
continue;
}
btrfs_drop_extent_map_range(inode, key.offset, end, true);
- unlock_extent(&inode->io_tree, key.offset, end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
up_read(&inode->i_mmap_lock);
}
}
@@ -1030,7 +948,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
- dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
ref.action = BTRFS_ADD_DELAYED_REF;
@@ -1042,7 +959,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1056,13 +973,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
}
- if (dirty)
- btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
btrfs_add_delayed_iput(inode);
return ret;
@@ -1255,13 +1170,11 @@ again:
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
- btrfs_mark_buffer_dirty(trans, parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
- btrfs_mark_buffer_dirty(trans, path->nodes[level]);
ref.action = BTRFS_ADD_DELAYED_REF;
ref.bytenr = old_bytenr;
@@ -1271,7 +1184,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1284,7 +1197,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1298,7 +1211,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1312,7 +1225,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1478,9 +1391,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_drop_extent_map_range(inode, start, end, true);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1562,7 +1475,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
* ->reloc_root. If it fails however we must
* drop the ref ourselves.
*/
- ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ ret2 = btrfs_drop_snapshot(reloc_root, false, true);
if (ret2 < 0) {
btrfs_put_root(reloc_root);
if (!ret)
@@ -1572,7 +1485,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, 0, 1);
+ ret2 = btrfs_drop_snapshot(root, false, true);
if (ret2 < 0) {
btrfs_put_root(root);
if (!ret)
@@ -1615,7 +1528,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- atomic_inc(&reloc_root->node->refs);
+ refcount_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -1797,8 +1710,8 @@ again:
rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
- reloc_root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&rc->reloc_roots,
+ struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
@@ -1863,7 +1776,7 @@ again:
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
@@ -1913,8 +1826,7 @@ again:
while (!list_empty(&reloc_roots)) {
found = 1;
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
@@ -2030,11 +1942,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
* reloc root without a corresponding root this could return ENOENT.
*/
if (IS_ERR(root)) {
- ASSERT(0);
+ DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
return PTR_ERR(root);
}
- if (root->reloc_root != reloc_root) {
- ASSERT(0);
+ if (unlikely(root->reloc_root != reloc_root)) {
+ DEBUG_WARN("unexpected reloc root found");
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
reloc_root->root_key.offset);
@@ -2058,100 +1970,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
int index = 0;
int ret;
- next = node;
- while (1) {
- cond_resched();
- next = walk_up_backref(next, edges, &index);
- root = next->root;
+ next = walk_up_backref(node, edges, &index);
+ root = next->root;
- /*
- * If there is no root, then our references for this block are
- * incomplete, as we should be able to walk all the way up to a
- * block that is owned by a root.
- *
- * This path is only for SHAREABLE roots, so if we come upon a
- * non-SHAREABLE root then we have backrefs that resolve
- * improperly.
- *
- * Both of these cases indicate file system corruption, or a bug
- * in the backref walking code.
- */
- if (!root) {
- ASSERT(0);
- btrfs_err(trans->fs_info,
- "bytenr %llu doesn't have a backref path ending in a root",
- node->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
- ASSERT(0);
- btrfs_err(trans->fs_info,
- "bytenr %llu has multiple refs with one ending in a non-shareable root",
- node->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
-
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
- ret = record_reloc_root_in_trans(trans, root);
- if (ret)
- return ERR_PTR(ret);
- break;
- }
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a block
+ * that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug in the
+ * backref walking code.
+ */
+ if (unlikely(!root)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
- ret = btrfs_record_root_in_trans(trans, root);
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
- root = root->reloc_root;
-
- /*
- * We could have raced with another thread which failed, so
- * root->reloc_root may not be set, return ENOENT in this case.
- */
- if (!root)
- return ERR_PTR(-ENOENT);
+ goto found;
+ }
- if (next->new_bytenr != root->node->start) {
- /*
- * We just created the reloc root, so we shouldn't have
- * ->new_bytenr set and this shouldn't be in the changed
- * list. If it is then we have multiple roots pointing
- * at the same bytenr which indicates corruption, or
- * we've made a mistake in the backref walking code.
- */
- ASSERT(next->new_bytenr == 0);
- ASSERT(list_empty(&next->list));
- if (next->new_bytenr || !list_empty(&next->list)) {
- btrfs_err(trans->fs_info,
- "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
- node->bytenr, next->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
+ root = root->reloc_root;
- next->new_bytenr = root->node->start;
- btrfs_put_root(next->root);
- next->root = btrfs_grab_root(root);
- ASSERT(next->root);
- list_add_tail(&next->list,
- &rc->backref_cache.changed);
- mark_block_processed(rc, next);
- break;
- }
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
- WARN_ON(1);
- root = NULL;
- next = walk_down_backref(edges, &index);
- if (!next || next->level <= node->level)
- break;
- }
- if (!root) {
+ if (unlikely(next->new_bytenr)) {
/*
- * This can happen if there's fs corruption or if there's a bug
- * in the backref lookup code.
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set yet. If it is then we have multiple roots
+ * pointing at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
*/
- ASSERT(0);
- return ERR_PTR(-ENOENT);
+ ASSERT(next->new_bytenr == 0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
}
+ next->new_bytenr = root->node->start;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
+ mark_block_processed(rc, next);
+found:
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
@@ -2191,7 +2075,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
- if (!root)
+ if (unlikely(!root))
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
@@ -2237,8 +2121,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2247,17 +2131,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
return num_bytes;
}
-static int reserve_metadata_space(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_backref_node *node)
+static int refill_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, u64 num_bytes)
{
- struct btrfs_root *root = rc->extent_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- u64 num_bytes;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- u64 tmp;
-
- num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@@ -2270,7 +2148,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
+ u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
+
while (tmp <= rc->reserved_bytes)
tmp <<= 1;
/*
@@ -2288,6 +2167,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
return 0;
}
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ u64 num_bytes;
+
+ num_bytes = calcu_metadata_size(rc, node) * 2;
+ return refill_metadata_space(trans, rc, num_bytes);
+}
+
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
@@ -2373,7 +2262,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
- if (bytenr != node->bytenr) {
+ if (unlikely(bytenr != node->bytenr)) {
btrfs_err(root->fs_info,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
@@ -2428,7 +2317,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
next:
@@ -2442,7 +2331,7 @@ next:
if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
- list_move_tail(&node->list, &rc->backref_cache.changed);
+ list_del_init(&node->list);
node->pending = 0;
}
@@ -2479,8 +2368,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans,
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
- node = list_entry(cache->pending[level].next,
- struct btrfs_backref_node, list);
+ node = list_first_entry(&cache->pending[level],
+ struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
@@ -2518,8 +2407,8 @@ static void update_processed_blocks(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2531,8 +2420,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
u32 blocksize = rc->extent_root->fs_info->nodesize;
- if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
+ if (btrfs_test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2550,7 +2439,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2605,8 +2494,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
/*
* This block was the root block of a root, and this is
* the first time we're processing the block and thus it
- * should not have had the ->new_bytenr modified and
- * should have not been included on the changed list.
+ * should not have had the ->new_bytenr modified.
*
* However in the case of corruption we could have
* multiple refs pointing to the same block improperly,
@@ -2616,8 +2504,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
- ASSERT(list_empty(&node->list));
- if (node->new_bytenr || !list_empty(&node->list)) {
+ if (unlikely(node->new_bytenr)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@@ -2640,17 +2527,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
btrfs_put_root(node->root);
node->root = btrfs_grab_root(root);
ASSERT(node->root);
- list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
- path->lowest_level = node->level;
- if (root == root->fs_info->chunk_root)
- btrfs_reserve_chunk_metadata(trans, false);
- ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(path);
- if (root == root->fs_info->chunk_root)
- btrfs_trans_release_chunk_metadata(trans);
- if (ret > 0)
- ret = 0;
+ btrfs_err(root->fs_info,
+ "bytenr %llu resolved to a non-shareable root",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
}
if (!ret)
update_processed_blocks(rc, node);
@@ -2658,11 +2540,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
- if (ret || node->level == 0 || node->cowonly)
+ if (ret || node->level == 0)
btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
+static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct tree_block *block,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ u64 num_bytes;
+ int nr_levels;
+ int ret;
+
+ root = btrfs_get_fs_root(fs_info, block->owner, true);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
+
+ num_bytes = fs_info->nodesize * nr_levels;
+ ret = refill_metadata_space(trans, rc, num_bytes);
+ if (ret) {
+ btrfs_put_root(root);
+ return ret;
+ }
+ path->lowest_level = block->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
+
+ ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
+
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret > 0)
+ ret = 0;
+ btrfs_put_root(root);
+
+ return ret;
+}
+
/*
* relocate a list of blocks
*/
@@ -2702,6 +2623,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Do tree relocation */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
+ /*
+ * For COWonly blocks, or the data reloc tree, we only need to
+ * COW down to the block; there is no need to generate a backref
+ * tree.
+ */
+ if (block->owner &&
+ (!btrfs_is_fstree(block->owner) ||
+ block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
+ ret = relocate_cowonly_block(trans, rc, block, path);
+ if (ret)
+ break;
+ continue;
+ }
+
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -2735,69 +2670,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
u64 num_bytes;
int nr;
int ret = 0;
- u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
/*
- * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
- * This means the range [i_size, PAGE_END + 1) is filled with zeros by
- * btrfs_do_readpage() call of previously relocated file cluster.
+ * For the blocksize < folio size case (either bs < page size or large folios),
+ * all blocks beyond i_size are filled with zeros.
*
- * If the current cluster starts in the above range, btrfs_do_readpage()
+ * If the current cluster covers the above range, btrfs_do_readpage()
* will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
- * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ * Here we have to invalidate the cache covering our cluster.
*/
- if (!PAGE_ALIGNED(i_size)) {
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
- struct folio *folio;
-
- ASSERT(sectorsize < PAGE_SIZE);
- ASSERT(IS_ALIGNED(i_size, sectorsize));
-
- /*
- * Subpage can't handle page with DIRTY but without UPTODATE
- * bit as it can lead to the following deadlock:
- *
- * btrfs_read_folio()
- * | Page already *locked*
- * |- btrfs_lock_and_flush_ordered_range()
- * |- btrfs_start_ordered_extent()
- * |- extent_write_cache_pages()
- * |- lock_page()
- * We try to lock the page we already hold.
- *
- * Here we just writeback the whole data reloc inode, so that
- * we will be ensured to have no dirty range in the page, and
- * are safe to clear the uptodate bits.
- *
- * This shouldn't cause too much overhead, as we need to write
- * the data back anyway.
- */
- ret = filemap_write_and_wait(mapping);
- if (ret < 0)
- return ret;
-
- clear_extent_bits(&inode->io_tree, i_size,
- round_up(i_size, PAGE_SIZE) - 1,
- EXTENT_UPTODATE);
- folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
- /*
- * If page is freed we don't need to do anything then, as we
- * will re-read the whole page anyway.
- */
- if (!IS_ERR(folio)) {
- btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
- round_up(i_size, PAGE_SIZE) - i_size);
- folio_unlock(folio);
- folio_put(folio);
- }
- }
+ ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start,
+ prealloc_end);
+ if (ret < 0)
+ return ret;
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
@@ -2815,21 +2705,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
- prealloc_end + 1 - cur_offset);
+ btrfs_free_reserved_data_space_noquota(inode,
+ prealloc_end + 1 - cur_offset);
return ret;
}
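As an aside, the prealloc range above is just the cluster's logical range shifted by the block group start recorded in the relocation inode. A minimal standalone model of that arithmetic (the 1 GiB / 16 MiB constants are invented for illustration, not taken from this patch):

#include <stdio.h>
#include <stdint.h>

/* Toy model of a file extent cluster: a logical byte range in a block group. */
struct cluster {
	uint64_t start;	/* logical start of the cluster */
	uint64_t end;	/* logical end of the cluster (inclusive) */
};

int main(void)
{
	/* Hypothetical values: block group at 1 GiB, cluster inside it. */
	const uint64_t reloc_block_group_start = 1ULL << 30;
	const struct cluster c = {
		.start = (1ULL << 30) + (16ULL << 20),
		.end   = (1ULL << 30) + (24ULL << 20) - 1,
	};

	/* File offsets inside the data reloc inode are cluster-relative. */
	uint64_t prealloc_start = c.start - reloc_block_group_start;
	uint64_t prealloc_end = c.end - reloc_block_group_start;

	printf("prealloc range: [%llu, %llu]\n",
	       (unsigned long long)prealloc_start,
	       (unsigned long long)prealloc_end);
	return 0;
}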
@@ -2843,7 +2733,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
u64 end = rc->cluster.end - offset;
int ret = 0;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return -ENOMEM;
@@ -2854,10 +2744,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
em->ram_bytes = em->len;
em->flags |= EXTENT_FLAG_PINNED;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(inode, em, false);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
- free_extent_map(em);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2886,13 +2776,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
static int relocate_one_folio(struct reloc_control *rc,
struct file_ra_state *ra,
- int *cluster_nr, unsigned long index)
+ int *cluster_nr, u64 *file_offset_ret)
{
const struct file_extent_cluster *cluster = &rc->cluster;
struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ const u64 orig_file_offset = *file_offset_ret;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
u64 folio_start;
@@ -2902,6 +2794,7 @@ static int relocate_one_folio(struct reloc_control *rc,
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
+again:
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
@@ -2924,8 +2817,6 @@ static int relocate_one_folio(struct reloc_control *rc,
return PTR_ERR(folio);
}
- WARN_ON(folio_order(folio));
-
if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);
@@ -2933,15 +2824,20 @@ static int relocate_one_folio(struct reloc_control *rc,
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto release_folio;
}
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
/*
* We could have lost folio private when we dropped the lock to read the
- * folio above, make sure we set_page_extent_mapped here so we have any
+	 * folio above, make sure we call set_folio_extent_mapped() here so we have any
* of the subpage blocksize stuff we need in place.
*/
ret = set_folio_extent_mapped(folio);
@@ -2949,7 +2845,7 @@ static int relocate_one_folio(struct reloc_control *rc,
goto release_folio;
folio_start = folio_pos(folio);
- folio_end = folio_start + PAGE_SIZE - 1;
+ folio_end = folio_start + folio_size(folio) - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
@@ -2973,15 +2869,15 @@ static int relocate_one_folio(struct reloc_control *rc,
goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start,
+ clamped_end, &cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, &cached_state);
if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- clamped_start, clamped_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY,
- &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY,
+ &cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@@ -2997,18 +2893,19 @@ static int relocate_one_folio(struct reloc_control *rc,
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ folio_start, folio_size(folio))) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
fs_info->sectorsize - 1;
- set_extent_bit(&BTRFS_I(inode)->io_tree,
- boundary_start, boundary_end,
- EXTENT_BOUNDARY, NULL);
+ btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY, NULL);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -3027,6 +2924,7 @@ static int relocate_one_folio(struct reloc_control *rc,
btrfs_throttle(fs_info);
if (btrfs_should_cancel_balance(fs_info))
ret = -ECANCELED;
+ *file_offset_ret = folio_end + 1;
return ret;
release_folio:
@@ -3040,9 +2938,8 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
struct inode *inode = rc->data_inode;
const struct file_extent_cluster *cluster = &rc->cluster;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- unsigned long index;
- unsigned long last_index;
- struct file_ra_state *ra;
+ u64 cur_file_offset = cluster->start - offset;
+ struct file_ra_state AUTO_KFREE(ra);
int cluster_nr = 0;
int ret = 0;
@@ -3055,22 +2952,21 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
ret = prealloc_file_extent_cluster(rc);
if (ret)
- goto out;
+ return ret;
file_ra_state_init(ra, inode->i_mapping);
ret = setup_relocation_extent_mapping(rc);
if (ret)
- goto out;
+ return ret;
- last_index = (cluster->end - offset) >> PAGE_SHIFT;
- for (index = (cluster->start - offset) >> PAGE_SHIFT;
- index <= last_index && !ret; index++)
- ret = relocate_one_folio(rc, ra, &cluster_nr, index);
+ while (cur_file_offset < cluster->end - offset) {
+ ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset);
+ if (ret)
+ break;
+ }
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
-out:
- kfree(ra);
return ret;
}
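The loop above no longer iterates by PAGE_SIZE index; relocate_one_folio() reports the first byte past the folio it handled and the caller advances by that amount, which also works when large folios back the mapping. A standalone sketch of the iteration pattern (the 4K/16K folio sizes are example numbers, not requirements):

#include <stdio.h>
#include <stdint.h>

/* End of the folio containing 'cur', plus one byte. */
static uint64_t next_offset(uint64_t cur, unsigned int folio_size)
{
	return (cur & ~((uint64_t)folio_size - 1)) + folio_size;
}

int main(void)
{
	const uint64_t cluster_len = 64 * 1024;
	uint64_t cur;
	int steps;

	/* 4K folios: 16 iterations to cover the cluster. */
	for (steps = 0, cur = 0; cur < cluster_len; steps++)
		cur = next_offset(cur, 4096);
	printf("4K folios: %d iterations\n", steps);

	/* 16K large folios: only 4 iterations for the same range. */
	for (steps = 0, cur = 0; cur < cluster_len; steps++)
		cur = next_offset(cur, 16384);
	printf("16K folios: %d iterations\n", steps);
	return 0;
}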
@@ -3229,7 +3125,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = false;
block->owner = owner;
- rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, &block->simple_node);
if (rb_node)
btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
-EEXIST);
@@ -3245,7 +3141,7 @@ static int __add_tree_block(struct reloc_control *rc,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -3269,11 +3165,11 @@ again:
key.offset = blocksize;
}
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && skinny) {
if (path->slots[0]) {
@@ -3300,31 +3196,29 @@ again:
"tree block extent item (%llu) is not found in extent tree",
bytenr);
WARN_ON(1);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- ret = add_tree_block(rc, &key, path, blocks);
-out:
- btrfs_free_path(path);
- return ret;
+ return add_tree_block(rc, &key, path, blocks);
}
-static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group,
+static int delete_block_group_cache(struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_inode *btrfs_inode;
int ret = 0;
if (inode)
goto truncate;
- inode = btrfs_iget(ino, root);
- if (IS_ERR(inode))
+ btrfs_inode = btrfs_iget(ino, root);
+ if (IS_ERR(btrfs_inode))
return -ENOENT;
+ inode = &btrfs_inode->vfs_inode;
truncate:
ret = btrfs_check_trunc_cache_free_space(fs_info,
@@ -3384,8 +3278,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
}
if (!found)
return -ENOENT;
- ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
- space_cache_ino);
+ ret = delete_block_group_cache(block_group, NULL, space_cache_ino);
return ret;
}
@@ -3465,8 +3358,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
0, 0);
if (ret < 0)
@@ -3505,9 +3398,9 @@ next:
goto next;
}
- block_found = find_first_extent_bit(&rc->processed_blocks,
- key.objectid, &start, &end,
- EXTENT_DIRTY, NULL);
+ block_found = btrfs_find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
if (block_found && start <= key.objectid) {
btrfs_release_path(path);
@@ -3596,7 +3489,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
u64 flags;
int ret;
@@ -3716,7 +3609,7 @@ restart:
}
btrfs_release_path(path);
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
+ btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL);
if (trans) {
btrfs_end_transaction_throttle(trans);
@@ -3765,14 +3658,13 @@ out_free:
if (ret < 0 && !err)
err = ret;
btrfs_free_block_rsv(fs_info, rc->block_rsv);
- btrfs_free_path(path);
return err;
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
@@ -3783,7 +3675,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
@@ -3793,16 +3685,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
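BTRFS_PATH_AUTO_FREE above removes the explicit btrfs_free_path() calls by tying the free to scope exit. The exact macro lives in the btrfs headers; as a rough standalone illustration of the compiler feature this style of macro builds on (the toy_* names are inventions for the example, and the cleanup attribute is a GCC/Clang extension):

#include <stdio.h>
#include <stdlib.h>

struct toy_path {
	int slots[8];
};

/* Cleanup helper: runs automatically when the variable goes out of scope. */
static void toy_free_path(struct toy_path **p)
{
	if (*p) {
		printf("freeing path\n");
		free(*p);
	}
}

/* Rough analogue of an AUTO_FREE-style declaration macro. */
#define TOY_PATH_AUTO_FREE(name) \
	struct toy_path *name __attribute__((cleanup(toy_free_path))) = NULL

static int do_something(int fail)
{
	TOY_PATH_AUTO_FREE(path);

	path = calloc(1, sizeof(*path));
	if (!path)
		return -1;

	if (fail)
		return -2;	/* early return: path is still freed */

	return 0;		/* normal return: path is freed here too */
}

int main(void)
{
	printf("ret=%d\n", do_something(0));
	printf("ret=%d\n", do_something(1));
	return 0;
}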
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -3825,7 +3714,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans,
out:
if (ret)
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
}
/*
@@ -3833,10 +3721,10 @@ out:
* the inode is in data relocation tree and its link count is 0
*/
static noinline_for_stack struct inode *create_reloc_inode(
- struct btrfs_fs_info *fs_info,
const struct btrfs_block_group *group)
{
- struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info = group->fs_info;
+ struct btrfs_inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
@@ -3864,23 +3752,25 @@ static noinline_for_stack struct inode *create_reloc_inode(
inode = NULL;
goto out;
}
- BTRFS_I(inode)->reloc_block_group_start = group->start;
+ inode->reloc_block_group_start = group->start;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (ret) {
- iput(inode);
- inode = ERR_PTR(ret);
+ if (inode)
+ iput(&inode->vfs_inode);
+ return ERR_PTR(ret);
}
- return inode;
+ return &inode->vfs_inode;
}
/*
* Mark start of chunk relocation that is cancellable. Check if the cancellation
* has been requested meanwhile and don't start in that case.
+ * NOTE: if this returns an error, reloc_chunk_end() must not be called.
*
* Return:
* 0 success
@@ -3897,10 +3787,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
- /*
- * On cancel, clear all requests but let the caller mark
- * the end after cleanup operations.
- */
+ /* On cancel, clear all requests. */
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
atomic_set(&fs_info->reloc_cancel_req, 0);
return -ECANCELED;
}
@@ -3909,9 +3797,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
/*
* Mark end of chunk relocation that is cancellable and wake any waiters.
+ * NOTE: call only if a previous call to reloc_chunk_start() succeeded.
*/
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
+ ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags));
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
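The NOTE comments change the pairing contract: reloc_chunk_end() is only valid after a successful reloc_chunk_start(), which is why btrfs_relocate_block_group() below moves the end call off the shared error path. A standalone model of the intended call pattern (the model_* names and the bare flag are illustrative, not the kernel code):

#include <stdio.h>
#include <stdbool.h>
#include <assert.h>

/* Toy model of the cancellable-relocation bracket. */
static bool reloc_running;

static int model_chunk_start(bool cancel_requested)
{
	reloc_running = true;
	if (cancel_requested) {
		/* On cancel the start routine cleans up after itself... */
		reloc_running = false;
		return -1;	/* ...and the caller must NOT call end(). */
	}
	return 0;
}

static void model_chunk_end(void)
{
	/* Mirrors the new ASSERT: end() only pairs with a successful start(). */
	assert(reloc_running);
	reloc_running = false;
}

static int relocate(bool cancel_requested)
{
	int ret = model_chunk_start(cancel_requested);

	if (ret < 0)
		return ret;	/* skip model_chunk_end() entirely */

	/* ... relocation work would happen here ... */

	model_chunk_end();
	return 0;
}

int main(void)
{
	printf("normal run: %d\n", relocate(false));
	printf("cancelled at start: %d\n", relocate(true));
	return 0;
}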
@@ -3932,7 +3822,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
rc->reloc_root_tree.rb_root = RB_ROOT;
spin_lock_init(&rc->reloc_root_tree.lock);
- extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
+ btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -3953,7 +3843,7 @@ static void free_reloc_control(struct reloc_control *rc)
*/
static void describe_relocation(struct btrfs_block_group *block_group)
{
- char buf[128] = {'\0'};
+ char buf[128] = "NONE";
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
@@ -3973,7 +3863,8 @@ static const char *stage_to_string(enum reloc_stage stage)
/*
* function to relocate all extents in a block group.
*/
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
@@ -3981,8 +3872,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
struct inode *inode;
struct btrfs_path *path;
int ret;
- int rw = 0;
- int err = 0;
+ bool bg_is_ro = false;
/*
* This only gets set if we had a half-deleted snapshot on mount. We
@@ -4024,24 +3914,20 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
}
ret = reloc_chunk_start(fs_info);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_put_bg;
- }
rc->extent_root = extent_root;
rc->block_group = bg;
ret = btrfs_inc_block_group_ro(rc->block_group, true);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- rw = 1;
+ bg_is_ro = true;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -4049,23 +3935,22 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ ret = delete_block_group_cache(rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
- if (ret && ret != -ENOENT) {
- err = ret;
+ if (ret && ret != -ENOENT)
goto out;
- }
- rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ rc->data_inode = create_reloc_inode(rc->block_group);
if (IS_ERR(rc->data_inode)) {
- err = PTR_ERR(rc->data_inode);
+ ret = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
goto out;
}
- describe_relocation(rc->block_group);
+ if (verbose)
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
@@ -4080,8 +3965,6 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0)
- err = ret;
finishes_stage = rc->stage;
/*
@@ -4094,37 +3977,41 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
- (u64)-1);
- if (ret)
- err = ret;
+ int wb_ret;
+
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
+ (u64)-1);
+ if (wb_ret && ret == 0)
+ ret = wb_ret;
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
- if (err < 0)
+ if (ret < 0)
goto out;
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents, stage: %s",
- rc->extents_found, stage_to_string(finishes_stage));
+ if (verbose)
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found,
+ stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(rc->block_group->used > 0);
out:
- if (err && rw)
+ if (ret && bg_is_ro)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
+ reloc_chunk_end(fs_info);
out_put_bg:
btrfs_put_block_group(bg);
- reloc_chunk_end(fs_info);
free_reloc_control(rc);
- return err;
+ return ret;
}
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
@@ -4255,8 +4142,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
list_del(&reloc_root->root_list);
if (btrfs_root_refs(&reloc_root->root_item) == 0) {
@@ -4306,8 +4192,8 @@ out_clean:
ret = ret2;
out_unset:
unset_reloc_control(rc);
-out_end:
reloc_chunk_end(fs_info);
+out_end:
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
@@ -4349,7 +4235,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
- list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_first_entry(&list, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
@@ -4399,11 +4285,21 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
- BUG_ON(node->bytenr != buf->start &&
- node->new_bytenr != buf->start);
+
+ /*
+	 * If node->bytenr != buf->start and node->new_bytenr != buf->start,
+	 * then we've got the wrong backref node for what we expected to see
+	 * here and the cache is incorrect.
+ */
+ if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
+ btrfs_err(fs_info,
+"bytenr %llu was found but our backref cache was expecting %llu or %llu",
+ buf->start, node->bytenr, node->new_bytenr);
+ return -EUCLEAN;
+ }
btrfs_backref_drop_node_buffer(node);
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
@@ -4500,10 +4396,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
new_root->reloc_root = btrfs_grab_root(reloc_root);
-
- if (rc->create_reloc_tree)
- ret = clone_backref_node(trans, rc, root, reloc_root);
- return ret;
+ return 0;
}
/*
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 788c86d8633a..5c36b3f84b57 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -12,7 +12,8 @@ struct btrfs_trans_handle;
struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 33962671a96c..6a7e297ab0a7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of the valid range.
*/
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *l;
int ret;
int slot;
@@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_crit(fs_info,
- "unable to find root key (%llu %u %llu) in tree %llu",
- key->objectid, key->type, key->offset, btrfs_root_id(root));
+ "unable to find root key " BTRFS_KEY_FMT " in tree %llu",
+ BTRFS_KEY_FMT_VALUE(key), btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
@@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
slot = path->slots[0];
@@ -197,9 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -217,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
@@ -310,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
btrfs_put_root(root);
}
- btrfs_free_path(path);
return err;
}
@@ -319,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -327,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret != 0) {
+ return ret;
+ if (unlikely(ret > 0))
/* The root must exist but we did not find it by the key. */
- ret = -EUCLEAN;
- goto out;
- }
+ return -EUCLEAN;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
@@ -345,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -362,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -370,18 +361,16 @@ again:
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
- memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
- ret = -ENOENT;
- goto out;
- }
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len))
+ return -ENOENT;
+
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out;
+ return ret;
} else {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
@@ -392,8 +381,6 @@ again:
goto again;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -419,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -434,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name->len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
return ret;
}
@@ -447,7 +433,6 @@ again:
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -457,7 +442,6 @@ again:
goto again;
}
- btrfs_free_path(path);
return 0;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 204c928beaf9..a40ee41f42c6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,8 +66,6 @@ struct scrub_ctx;
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
- bool is_metadata;
-
union {
/*
* Csum pointer for data csum verification. Should point to a
@@ -100,7 +98,39 @@ enum scrub_stripe_flags {
SCRUB_STRIPE_FLAG_NO_REPORT,
};
-#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+/*
+ * We have multiple bitmaps for one scrub_stripe.
+ * However, each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
+ * which is normally 16 and much smaller than BITS_PER_LONG (32 or 64).
+ *
+ * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
+ * into a larger one.
+ *
+ * This enum records where each sub-bitmap is inside the larger one.
+ * Each sub-bitmap starts at bit scrub_bitmap_nr_##name * nr_sectors.
+ */
+enum {
+ /* Which blocks are covered by extent items. */
+ scrub_bitmap_nr_has_extent = 0,
+
+ /* Which blocks are metadata. */
+ scrub_bitmap_nr_is_metadata,
+
+ /*
+ * Which blocks have errors, including IO, csum, and metadata
+ * errors.
+	 * This sub-bitmap is the OR result of the next few error-related
+	 * sub-bitmaps.
+ */
+ scrub_bitmap_nr_error,
+ scrub_bitmap_nr_io_error,
+ scrub_bitmap_nr_csum_error,
+ scrub_bitmap_nr_meta_error,
+ scrub_bitmap_nr_meta_gen_error,
+ scrub_bitmap_nr_last,
+};
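With the common 64K stripe and 4K blocks there are 16 sectors per stripe, so each sub-bitmap is a 16-bit window in the packed bitmap and the bit for (sub-bitmap, block) is simply nr * nr_sectors + block_nr. A standalone model of that layout (the 64K/4K numbers are the usual case, not a requirement):

#include <stdio.h>

enum {
	nr_has_extent = 0,
	nr_is_metadata,
	nr_error,
	nr_io_error,
	nr_csum_error,
	nr_meta_error,
	nr_meta_gen_error,
	nr_last,
};

int main(void)
{
	const unsigned int stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	const unsigned int blocksize = 4096;
	const unsigned int nr_sectors = stripe_len / blocksize;	/* 16 */

	/* Bit index of block 5 in each sub-bitmap. */
	for (int nr = 0; nr < nr_last; nr++)
		printf("sub-bitmap %d, block 5 -> bit %u\n",
		       nr, nr * nr_sectors + 5);

	/* Total bits needed: 7 sub-bitmaps * 16 sectors = 112 bits. */
	printf("total bits: %u\n", nr_last * nr_sectors);
	return 0;
}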
+
+#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE)
/*
* Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
@@ -109,7 +139,7 @@ struct scrub_stripe {
struct scrub_ctx *sctx;
struct btrfs_block_group *bg;
- struct page *pages[SCRUB_STRIPE_PAGES];
+ struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
struct scrub_sector_verification *sectors;
struct btrfs_device *dev;
@@ -138,36 +168,15 @@ struct scrub_stripe {
*/
unsigned long state;
- /* Indicate which sectors are covered by extent items. */
- unsigned long extent_sector_bitmap;
+ /* The large bitmap contains all the sub-bitmaps. */
+ unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
+ (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];
/*
- * The errors hit during the initial read of the stripe.
- *
- * Would be utilized for error reporting and repair.
- *
- * The remaining init_nr_* records the number of errors hit, only used
- * by error reporting.
+ * For writeback (repair or replace) error reporting.
+	 * This one is protected by a spinlock, thus it cannot be packed into
+ * the larger bitmap.
*/
- unsigned long init_error_bitmap;
- unsigned int init_nr_io_errors;
- unsigned int init_nr_csum_errors;
- unsigned int init_nr_meta_errors;
-
- /*
- * The following error bitmaps are all for the current status.
- * Every time we submit a new read, these bitmaps may be updated.
- *
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
- *
- * IO and csum errors can happen for both metadata and data.
- */
- unsigned long error_bitmap;
- unsigned long io_error_bitmap;
- unsigned long csum_error_bitmap;
- unsigned long meta_error_bitmap;
-
- /* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
/* Writeback can be concurrent, thus we need to protect the bitmap. */
@@ -197,7 +206,7 @@ struct scrub_ctx {
ktime_t throttle_deadline;
u64 throttle_sent;
- int is_dev_replace;
+ bool is_dev_replace;
u64 write_pointer;
struct mutex wr_lock;
@@ -219,6 +228,90 @@ struct scrub_ctx {
refcount_t refs;
};
+#define scrub_calc_start_bit(stripe, name, block_nr) \
+({ \
+ unsigned int __start_bit; \
+ \
+ ASSERT(block_nr < stripe->nr_sectors, \
+ "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
+ __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
+ __start_bit; \
+})
+
+#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \
+static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, \
+ name, block_nr); \
+ \
+ bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ return test_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ set_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ clear_bit(start_bit, stripe->bitmaps); \
+} \
+static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
+{ \
+ const unsigned int nr_blocks = stripe->nr_sectors; \
+ \
+ ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \
+ "nr_blocks=%u BITS_PER_LONG=%u", \
+ nr_blocks, BITS_PER_LONG); \
+ \
+ return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
+ stripe->nr_sectors); \
+} \
+static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_empty(&bitmap, stripe->nr_sectors); \
+} \
+static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_weight(&bitmap, stripe->nr_sectors); \
+}
+IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
+IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
+IMPLEMENT_SCRUB_BITMAP_OPS(error);
+IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);
+
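Each IMPLEMENT_SCRUB_BITMAP_OPS() invocation stamps out a family of set/clear/test/read/weight helpers for one sub-bitmap. A much-reduced standalone model of the same token-pasting pattern (only set_bit/test_bit, with toy_* names and a single unsigned long instead of the real bitmap array, all invented for the example):

#include <stdio.h>
#include <stdbool.h>

struct toy_stripe {
	unsigned int nr_sectors;
	unsigned long bitmaps;	/* all sub-bitmaps packed together */
};

enum { toy_nr_has_extent = 0, toy_nr_io_error, toy_nr_last };

#define TOY_BITMAP_OPS(name)						\
static void toy_set_bit_##name(struct toy_stripe *s, unsigned int block)\
{									\
	s->bitmaps |= 1UL << (toy_nr_##name * s->nr_sectors + block);	\
}									\
static bool toy_test_bit_##name(struct toy_stripe *s, unsigned int block)\
{									\
	return s->bitmaps & (1UL << (toy_nr_##name * s->nr_sectors + block)); \
}

TOY_BITMAP_OPS(has_extent)
TOY_BITMAP_OPS(io_error)

int main(void)
{
	struct toy_stripe s = { .nr_sectors = 16, .bitmaps = 0 };

	toy_set_bit_has_extent(&s, 3);	/* bit 3 */
	toy_set_bit_io_error(&s, 3);	/* bit 1 * 16 + 3 = 19 */

	printf("has_extent(3)=%d io_error(3)=%d io_error(4)=%d\n",
	       toy_test_bit_has_extent(&s, 3),
	       toy_test_bit_io_error(&s, 3),
	       toy_test_bit_io_error(&s, 4));
	return 0;
}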
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
@@ -228,15 +321,28 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct scrub_error_records {
+ /*
+ * Bitmap recording which blocks hit errors (IO/csum/...) during the
+ * initial read.
+ */
+ unsigned long init_error_bitmap;
+
+ unsigned int nr_io_errors;
+ unsigned int nr_csum_errors;
+ unsigned int nr_meta_errors;
+ unsigned int nr_meta_gen_errors;
+};
+
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
if (!stripe)
return;
- for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
- if (stripe->pages[i])
- __free_page(stripe->pages[i]);
- stripe->pages[i] = NULL;
+ for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
+ if (stripe->folios[i])
+ folio_put(stripe->folios[i]);
+ stripe->folios[i] = NULL;
}
kfree(stripe->sectors);
kfree(stripe->csums);
@@ -249,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
struct scrub_stripe *stripe)
{
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
int ret;
memset(stripe, 0, sizeof(*stripe));
@@ -261,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
+ ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
+ ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
+ fs_info->block_min_order, stripe->folios);
if (ret < 0)
goto error;
@@ -340,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
- struct btrfs_fs_info *fs_info, int is_dev_replace)
+ struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
@@ -354,10 +463,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- sctx->extent_path.search_commit_root = 1;
- sctx->extent_path.skip_locking = 1;
- sctx->csum_path.search_commit_root = 1;
- sctx->csum_path.skip_locking = 1;
+ sctx->extent_path.search_commit_root = true;
+ sctx->extent_path.skip_locking = true;
+ sctx->csum_path.search_commit_root = true;
+ sctx->csum_path.skip_locking = true;
for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
int ret;
@@ -396,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
@@ -450,8 +559,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -460,18 +569,16 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
- free_ipath(ipath);
return 0;
err:
- btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ btrfs_warn(fs_info,
+ "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
- free_ipath(ipath);
return 0;
}
@@ -479,7 +586,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
bool is_super, u64 logical, u64 physical)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
@@ -490,7 +597,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
/* Super block error, no need to search extent tree. */
if (is_super) {
- btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
@@ -506,7 +613,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
if (ret < 0)
- goto out;
+ return;
swarn.extent_item_size = found_key.offset;
@@ -525,14 +632,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
- "failed to resolve tree backref for logical %llu: %d",
- swarn.logical, ret);
+ "scrub: failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
break;
}
if (ret > 0)
break;
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
@@ -552,9 +659,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
-
-out:
- btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -579,20 +683,32 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
+static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
- int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- return stripe->pages[page_index];
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_highmem(folio));
+ return folio_address(folio) + offset_in_folio(folio, offset);
}
-static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
- int sector_nr)
+static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- return offset_in_page(sector_nr << fs_info->sectorsize_bits);
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_highmem(folio));
+ /* And the range must be contained inside the folio. */
+ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
+ return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}
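Both helpers above turn a sector number into a (folio index, offset inside folio) pair using min_folio_shift = PAGE_SHIFT + block_min_order. A standalone model of that index math (4K pages and an order-2 minimum folio are example numbers only):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;		/* 4K pages */
	const unsigned int block_min_order = 2;		/* minimum folio = 16K */
	const unsigned int min_folio_shift = page_shift + block_min_order;
	const unsigned int sectorsize_bits = 12;	/* 4K fs blocks */

	for (unsigned int sector_nr = 0; sector_nr < 8; sector_nr++) {
		unsigned int offset = sector_nr << sectorsize_bits;
		unsigned int folio_index = offset >> min_folio_shift;
		unsigned int off_in_folio = offset & ((1U << min_folio_shift) - 1);

		printf("sector %u -> folios[%u] + %u\n",
		       sector_nr, folio_index, off_in_folio);
	}
	return 0;
}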
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
@@ -600,46 +716,44 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
- const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
- const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
+ void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ struct btrfs_header *header = first_kaddr;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 on_disk_csum[BTRFS_CSUM_SIZE];
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct btrfs_header *header;
/*
* Here we don't have a good way to attach the pages (and subpages)
* to a dummy extent buffer, thus we have to directly grab the members
* from pages.
*/
- header = (struct btrfs_header *)(page_address(first_page) + first_off);
memcpy(on_disk_csum, header->csum, fs_info->csum_size);
if (logical != btrfs_stack_header_bytenr(header)) {
- bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_bytenr(header), logical);
return;
}
if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
logical, stripe->mirror_num,
header->fsid, fs_info->fs_devices->fsid);
return;
}
if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
logical, stripe->mirror_num,
header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
return;
@@ -648,42 +762,40 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
/* Now check tree block csum. */
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_update(shash, page_address(first_page) + first_off +
- BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+ crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
+ fs_info->sectorsize - BTRFS_CSUM_SIZE);
for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
-
- crypto_shash_update(shash, page_address(page) + page_off,
+ crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
fs_info->sectorsize);
}
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
logical, stripe->mirror_num,
- CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
- CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+ BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+ BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
return;
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_generation(header),
stripe->sectors[sector_nr].generation);
return;
}
- bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -691,23 +803,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+ phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
u8 csum_buf[BTRFS_CSUM_SIZE];
int ret;
ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
/* Sector not utilized, skip it. */
- if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
+ if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
return;
/* IO error, no need to check. */
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
return;
/* Metadata, verify the full tree block. */
- if (sector->is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
/*
* Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
@@ -718,7 +829,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
*/
if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
btrfs_warn_rl(fs_info,
- "tree block at %llu crosses stripe boundary %llu",
+ "scrub: tree block at %llu crosses stripe boundary %llu",
stripe->logical +
(sector_nr << fs_info->sectorsize_bits),
stripe->logical);
@@ -733,17 +844,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
* cases without csum, we have no other choice but to trust it.
*/
if (!sector->csum) {
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
return;
}
- ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+ ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
if (ret < 0) {
- set_bit(sector_nr, &stripe->csum_error_bitmap);
- set_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_set_bit_error(stripe, sector_nr);
} else {
- clear_bit(sector_nr, &stripe->csum_error_bitmap);
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
}
}
@@ -756,7 +867,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b
for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
scrub_verify_one_sector(stripe, sector_nr);
- if (stripe->sectors[sector_nr].is_metadata)
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
sector_nr += sectors_per_tree - 1;
}
}
@@ -766,8 +877,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first
int i;
for (i = 0; i < stripe->nr_sectors; i++) {
- if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
- scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+ if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
break;
}
ASSERT(i < stripe->nr_sectors);
@@ -795,13 +905,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio)
bio_size += bvec->bv_len;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
- bitmap_set(&stripe->error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_clear_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io))
@@ -814,27 +924,55 @@ static int calc_next_mirror(int mirror, int num_copies)
return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
+static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
+ int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ int ret;
+
+ ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
+ offset_in_page(kaddr));
+ /*
+	 * The caller should ensure the bbio has enough size.
+	 * We cannot use __bio_add_page(), which doesn't do any merging.
+	 *
+	 * Meanwhile scrub_submit_initial_read() fully relies on that merging
+	 * to create the minimal number of bio vectors, for the fs block size <
+	 * page size case.
+ */
+ ASSERT(ret == fs_info->sectorsize);
+}
+
+static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
+ unsigned int nr_vecs, blk_opf_t opf,
+ u64 logical,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct btrfs_bio *bbio;
+
+ bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
+ logical, end_io, private);
+ bbio->is_scrub = true;
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ return bbio;
+}
+
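The comment in scrub_bio_add_sector() relies on bio_add_page() merging physically contiguous blocks into one bio vector, so blocks coming from the same folio cost a single segment. A standalone model of that merging effect (the toy_* names and physical addresses are made up for the example):

#include <stdio.h>
#include <stdint.h>

/*
 * Toy model of bio vector merging: a new segment is only added when the
 * next block is not physically contiguous with the end of the last one.
 */
struct toy_bio {
	int nr_vecs;
	uint64_t last_end;	/* physical end of the last segment, 0 = empty */
};

static void toy_add_block(struct toy_bio *bio, uint64_t paddr, unsigned int len)
{
	if (bio->last_end != paddr)
		bio->nr_vecs++;		/* cannot merge, start a new vector */
	bio->last_end = paddr + len;
}

int main(void)
{
	const unsigned int blocksize = 4096;
	struct toy_bio bio = { 0 };

	/* 16 blocks from one folio: all physically contiguous -> 1 vector. */
	for (int i = 0; i < 16; i++)
		toy_add_block(&bio, 0x100000 + (uint64_t)i * blocksize, blocksize);
	printf("contiguous folio: %d vector(s)\n", bio.nr_vecs);

	/* Same 16 blocks spread over separate, gapped pages -> 16 vectors. */
	bio = (struct toy_bio){ 0 };
	for (int i = 0; i < 16; i++)
		toy_add_block(&bio, 0x200000 + (uint64_t)i * 2 * blocksize, blocksize);
	printf("scattered blocks: %d vector(s)\n", bio.nr_vecs);
	return 0;
}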
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
int mirror, int blocksize, bool wait)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
int i;
- ASSERT(stripe->mirror_num >= 1);
- ASSERT(atomic_read(&stripe->pending_io) == 0);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
+ ASSERT(atomic_read(&stripe->pending_io) == 0,
+ "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));
for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
- struct page *page;
- int pgoff;
- int ret;
-
- page = scrub_stripe_get_page(stripe, i);
- pgoff = scrub_stripe_get_page_offset(stripe, i);
-
/* The current sector cannot be merged, submit the bio. */
- if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+ if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
bbio->bio.bi_iter.bi_size >= blocksize)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -844,15 +982,12 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
bbio = NULL;
}
- if (!bbio) {
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_repair_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = (stripe->logical +
- (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
- }
+ if (!bbio)
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
+ stripe->logical + (i << fs_info->sectorsize_bits),
+ scrub_repair_read_endio, stripe);
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
@@ -864,12 +999,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
}
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+ struct scrub_stripe *stripe,
+ const struct scrub_error_records *errors)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_device *dev = NULL;
+ const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
+ const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
u64 physical = 0;
int nr_data_sectors = 0;
int nr_meta_sectors = 0;
@@ -886,14 +1024,14 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
* Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
- if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+ if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
u64 mapped_len = fs_info->sectorsize;
struct btrfs_io_context *bioc = NULL;
int stripe_index = stripe->mirror_num - 1;
int ret;
/* For scrub, our mirror_num should always start at 1. */
- ASSERT(stripe->mirror_num >= 1);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
NULL, NULL);
@@ -909,10 +1047,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
}
skip:
- for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
bool repaired = false;
- if (stripe->sectors[sector_nr].is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
nr_meta_sectors++;
} else {
nr_data_sectors++;
@@ -920,14 +1058,14 @@ skip:
nr_nodatacsum_sectors++;
}
- if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
- !test_bit(sector_nr, &stripe->error_bitmap)) {
+ if (test_bit(sector_nr, &errors->init_error_bitmap) &&
+ !test_bit(sector_nr, &error_bitmap)) {
nr_repaired_sectors++;
repaired = true;
}
/* Good sector from the beginning, nothing need to be done. */
- if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+ if (!test_bit(sector_nr, &errors->init_error_bitmap))
continue;
/*
@@ -936,13 +1074,13 @@ skip:
*/
if (repaired) {
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
@@ -950,41 +1088,56 @@ skip:
/* The remaining are all for unrepaired. */
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("i/o error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+ if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("checksum error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+ if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < errors->nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < errors->nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch errors are accounted once per metadata tree block, not per sector. */
+ for (int i = 0; i < errors->nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
sctx->stat.no_csum += nr_nodatacsum_sectors;
- sctx->stat.read_errors += stripe->init_nr_io_errors;
- sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.read_errors += errors->nr_io_errors;
+ sctx->stat.csum_errors += errors->nr_csum_errors;
+ sctx->stat.verify_errors += errors->nr_meta_errors +
+ errors->nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
- bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+ bitmap_weight(&error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
spin_unlock(&sctx->stat_lock);
}
@@ -1010,26 +1163,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
struct scrub_ctx *sctx = stripe->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct scrub_error_records errors = { 0 };
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
unsigned long repaired;
+ unsigned long error;
int mirror;
int i;
- ASSERT(stripe->mirror_num > 0);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
wait_scrub_stripe_io(stripe);
- scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+ scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
/* Save the initial failed bitmap for later repair and reporting. */
- stripe->init_error_bitmap = stripe->error_bitmap;
- stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
- stripe->nr_sectors);
-
- if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+ errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
+ errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
+ errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
+ errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
+ errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);
+
+ if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
goto out;
/*
@@ -1041,13 +1194,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
mirror != stripe->mirror_num;
mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
BTRFS_STRIPE_LEN, false);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
@@ -1065,21 +1218,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (i = 0, mirror = stripe->mirror_num;
i < num_copies;
i++, mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
fs_info->sectorsize, true);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
out:
+ error = scrub_bitmap_read_error(stripe);
/*
* Submit the repaired sectors. For the zoned case, we cannot do the repair
* in-place, but queue the block group to be relocated.
*/
- bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
+ bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
stripe->nr_sectors);
if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
if (btrfs_is_zoned(fs_info)) {
@@ -1090,7 +1244,7 @@ out:
}
}
- scrub_stripe_report_errors(sctx, stripe);
+ scrub_stripe_report_errors(sctx, stripe, &errors);
set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
wake_up(&stripe->repair_wait);
}
@@ -1110,10 +1264,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio)
num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
- bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
+ scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1142,6 +1296,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
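+ /* Account each failed sector in this bio as a write error on the device. */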
+ for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1199,27 +1356,19 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
int sector_nr;
for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
- int ret;
-
/* We should only write back sectors covered by an extent. */
- ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+ ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));
/* Cannot merge with previous sector, submit the current one. */
if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
bbio = NULL;
}
- if (!bbio) {
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
- fs_info, scrub_write_endio, stripe);
- bbio->bio.bi_iter.bi_sector = (stripe->logical +
- (sector_nr << fs_info->sectorsize_bits)) >>
- SECTOR_SHIFT;
- }
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ if (!bbio)
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
+ stripe->logical + (sector_nr << fs_info->sectorsize_bits),
+ scrub_write_endio, stripe);
+ scrub_bio_add_sector(bbio, stripe, sector_nr);
}
if (bbio)
scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
@@ -1246,8 +1395,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* The slice is divided into intervals as the IO is submitted; the divisor is
* derived from bwlimit and capped at a maximum of 64 intervals.
*/
- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
- div = min_t(u32, 64, div);
+ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
/* Start new epoch, set deadline */
now = ktime_get();
@@ -1339,7 +1487,7 @@ static int compare_extent_item_range(struct btrfs_path *path,
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
- key.type == BTRFS_METADATA_ITEM_KEY);
+ key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
if (key.type == BTRFS_METADATA_ITEM_KEY)
len = fs_info->nodesize;
else
@@ -1380,17 +1528,17 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
if (path->nodes[0])
goto search_forward;
+ key.objectid = search_start;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = search_start;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1444,7 +1592,7 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
- key.type == BTRFS_EXTENT_ITEM_KEY);
+ key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
*extent_start_ret = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
*size_ret = path->nodes[0]->fs_info->nodesize;
@@ -1470,8 +1618,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
physical,
sctx->write_pointer);
if (ret)
- btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+ btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
@@ -1493,9 +1640,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
struct scrub_sector_verification *sector =
&stripe->sectors[nr_sector];
- set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- sector->is_metadata = true;
+ scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
sector->generation = extent_gen;
}
}
@@ -1503,15 +1650,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
- stripe->extent_sector_bitmap = 0;
- stripe->init_error_bitmap = 0;
- stripe->init_nr_io_errors = 0;
- stripe->init_nr_csum_errors = 0;
- stripe->init_nr_meta_errors = 0;
- stripe->error_bitmap = 0;
- stripe->io_error_bitmap = 0;
- stripe->csum_error_bitmap = 0;
- stripe->meta_error_bitmap = 0;
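+ /* All the per-sector bitmaps are packed into the single stripe->bitmaps array. */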
+ ASSERT(stripe->nr_sectors);
+ bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}
/*
@@ -1541,12 +1681,18 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
+ if (unlikely(!extent_root || !csum_root)) {
+ btrfs_err(fs_info, "scrub: no valid extent or csum root found");
+ return -EUCLEAN;
+ }
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
stripe->nr_sectors);
scrub_stripe_reset_bitmaps(stripe);
/* The range must be inside the bg. */
- ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length,
+ "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
+ bg->start, logical_start, logical_end, bg->start + bg->length);
ret = find_first_extent_item(extent_root, extent_path, logical_start,
logical_len);
@@ -1642,7 +1788,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
stripe->state = 0;
for (int i = 0; i < stripe->nr_sectors; i++) {
- stripe->sectors[i].is_metadata = false;
stripe->sectors[i].csum = NULL;
stripe->sectors[i].generation = 0;
}
@@ -1661,24 +1806,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
+ const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
atomic_inc(&stripe->pending_io);
- for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
-
+ for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
/* We're beyond the chunk boundary, no need to read anymore. */
if (i >= nr_sectors)
break;
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
- ((i > 0 &&
- !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ ((i > 0 && !test_bit(i - 1, &has_extent)) ||
bbio->bio.bi_iter.bi_size >= stripe_len)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -1691,7 +1833,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_io_context *bioc = NULL;
const u64 logical = stripe->logical +
(i << fs_info->sectorsize_bits);
- int err;
+ int ret;
io_stripe.rst_search_commit_root = true;
stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
@@ -1699,11 +1841,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* For RST cases, we need to manually split the bbio to
* follow the RST boundary.
*/
- err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err < 0) {
- if (err != -ENODATA) {
+ if (ret < 0) {
+ if (ret != -ENODATA) {
/*
* Earlier btrfs_get_raid_extent_offset()
* returned -ENODATA, which means there's
@@ -1712,18 +1854,17 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* the extent tree, then it's a preallocated
* extent and not an error.
*/
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_io_error(stripe, i);
+ scrub_bitmap_set_bit_error(stripe, i);
}
continue;
}
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
+ logical, scrub_read_endio, stripe);
}
- __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
@@ -1744,6 +1885,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
@@ -1756,20 +1898,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
return;
}
- bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
- scrub_read_endio, stripe);
-
- bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
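+ /* Enough bio vectors to cover a full stripe with minimum order folios. */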
+ bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
+ stripe->logical, scrub_read_endio, stripe);
/* Read the whole range inside the chunk boundary. */
- for (unsigned int cur = 0; cur < nr_sectors; cur++) {
- struct page *page = scrub_stripe_get_page(stripe, cur);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
- int ret;
-
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- /* We should have allocated enough bio vectors. */
- ASSERT(ret == fs_info->sectorsize);
- }
+ for (unsigned int cur = 0; cur < nr_sectors; cur++)
+ scrub_bio_add_sector(bbio, stripe, cur);
atomic_inc(&stripe->pending_io);
/*
@@ -1790,14 +1923,15 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
+ const unsigned long error = scrub_bitmap_read_error(stripe);
int i;
- for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
- if (stripe->sectors[i].is_metadata) {
+ for_each_set_bit(i, &error, stripe->nr_sectors) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
btrfs_err(fs_info,
- "stripe %llu has unrepaired metadata sector at %llu",
+ "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
stripe->logical,
stripe->logical + (i << fs_info->sectorsize_bits));
return true;
@@ -1861,20 +1995,23 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
* metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
- if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
ret = -EIO;
goto out;
}
}
for (int i = 0; i < nr_stripes; i++) {
unsigned long good;
+ unsigned long has_extent;
+ unsigned long error;
stripe = &sctx->stripes[i];
ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
- bitmap_andnot(&good, &stripe->extent_sector_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+ error = scrub_bitmap_read_error(stripe);
+ bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
scrub_write_sectors(sctx, stripe, good, true);
}
}
@@ -1940,37 +2077,135 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
return 0;
}
+/*
+ * Return 0 if we should not cancel the scrub.
+ * Return <0 if we need to cancel the scrub; the returned value indicates
+ * the reason:
+ * - -ECANCELED - Being explicitly canceled through ioctl.
+ * - -EINTR - Being interrupted by signal or fs/process freezing.
+ */
+static int should_cancel_scrub(const struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req))
+ return -ECANCELED;
+
+ /*
+ * The user (e.g. fsfreeze command) or power management (PM)
+ * suspend/hibernate can freeze the fs. And PM suspend/hibernate will
+ * also freeze all user processes.
+ *
+ * A user process can only be frozen when it is in user space, thus we
+ * have to cancel the scrub so that the process can return to user
+ * space.
+ *
+ * Furthermore we have to check both filesystem and process freezing,
+ * as PM can be configured to freeze the filesystems before processes.
+ *
+ * If we only check fs freezing, then suspend without fs freezing
+ * will time out, as the process is still in kernel space.
+ *
+ * If we only check process freezing, then suspend with fs freezing
+ * will time out, as the running scrub will prevent the fs from being frozen.
+ */
+ if (fs_info->sb->s_writers.frozen > SB_UNFROZEN ||
+ freezing(current) || signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
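+/*
+ * Verify and regenerate the P/Q stripes of one RAID56 full stripe, reusing
+ * the data stripes already read into @sctx->raid56_data_stripes as a cache
+ * so they do not need to be read from disk again.
+ */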
+static int scrub_raid56_cached_parity(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ struct btrfs_chunk_map *map,
+ u64 full_stripe_start,
+ unsigned long *extent_bitmap)
+{
+ DECLARE_COMPLETION_ONSTACK(io_done);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_raid_bio *rbio;
+ struct bio bio;
+ const int data_stripes = nr_data_stripes(map);
+ u64 length = btrfs_stripe_nr_to_offset(data_stripes);
+ int ret;
+
+ bio_init(&bio, NULL, NULL, 0, REQ_OP_READ);
+ bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
+ bio.bi_private = &io_done;
+ bio.bi_end_io = raid56_scrub_wait_endio;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+ &length, &bioc, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ /* For RAID56 write there must be an @bioc allocated. */
+ ASSERT(bioc);
+ rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+ btrfs_put_bioc(bioc);
+ if (!rbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* Use the recovered stripes as a cache to avoid reading them from disk again. */
+ for (int i = 0; i < data_stripes; i++) {
+ struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i];
+
+ raid56_parity_cache_data_folios(rbio, stripe->folios,
+ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
+ }
+ raid56_parity_submit_scrub_rbio(rbio);
+ wait_for_completion_io(&io_done);
+ ret = blk_status_to_errno(bio.bi_status);
+out:
+ btrfs_bio_counter_dec(fs_info);
+ bio_uninit(&bio);
+ return ret;
+}
+
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
- DECLARE_COMPLETION_ONSTACK(io_done);
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_raid_bio *rbio;
- struct btrfs_io_context *bioc = NULL;
struct btrfs_path extent_path = { 0 };
struct btrfs_path csum_path = { 0 };
- struct bio *bio;
struct scrub_stripe *stripe;
bool all_empty = true;
const int data_stripes = nr_data_stripes(map);
unsigned long extent_bitmap = 0;
- u64 length = btrfs_stripe_nr_to_offset(data_stripes);
int ret;
ASSERT(sctx->raid56_data_stripes);
+ ret = should_cancel_scrub(sctx);
+ if (ret < 0)
+ return ret;
+
+ if (atomic_read(&fs_info->scrub_pause_req))
+ scrub_blocked_if_needed(fs_info);
+
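+ /* Skip the full stripe if the block group got removed meanwhile. */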
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ return 0;
+ }
+ spin_unlock(&bg->lock);
+
/*
* For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than the previous extent's. Thus
* we have to use our own extent/csum paths.
*/
- extent_path.search_commit_root = 1;
- extent_path.skip_locking = 1;
- csum_path.search_commit_root = 1;
- csum_path.skip_locking = 1;
+ extent_path.search_commit_root = true;
+ extent_path.skip_locking = true;
+ csum_path.search_commit_root = true;
+ csum_path.skip_locking = true;
for (int i = 0; i < data_stripes; i++) {
int stripe_index;
@@ -2008,7 +2243,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
/* Check if all data stripes are empty. */
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
- if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+ if (!scrub_bitmap_empty_has_extent(stripe)) {
all_empty = false;
break;
}
@@ -2040,65 +2275,36 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
*/
for (int i = 0; i < data_stripes; i++) {
unsigned long error;
+ unsigned long has_extent;
stripe = &sctx->raid56_data_stripes[i];
+ error = scrub_bitmap_read_error(stripe);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+
/*
* We should only check the errors where there is an extent, as we may
* hit an empty data stripe while it is missing.
*/
- bitmap_and(&error, &stripe->error_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
- if (!bitmap_empty(&error, stripe->nr_sectors)) {
+ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
+ if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
btrfs_err(fs_info,
-"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
ret = -EIO;
goto out;
}
- bitmap_or(&extent_bitmap, &extent_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
+ stripe->nr_sectors);
}
/* Now we can check and regenerate the P/Q stripe. */
- bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
- bio->bi_private = &io_done;
- bio->bi_end_io = raid56_scrub_wait_endio;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL);
- if (ret < 0) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- goto out;
- }
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
- BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
- btrfs_put_bioc(bioc);
- if (!rbio) {
- ret = -ENOMEM;
- btrfs_bio_counter_dec(fs_info);
- goto out;
- }
- /* Use the recovered stripes as cache to avoid read them from disk again. */
- for (int i = 0; i < data_stripes; i++) {
- stripe = &sctx->raid56_data_stripes[i];
-
- raid56_parity_cache_data_pages(rbio, stripe->pages,
- full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
- }
- raid56_parity_submit_scrub_rbio(rbio);
- wait_for_completion_io(&io_done);
- ret = blk_status_to_errno(bio->bi_status);
- bio_put(bio);
- btrfs_bio_counter_dec(fs_info);
-
+ ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
+ &extent_bitmap);
+out:
btrfs_release_path(&extent_path);
btrfs_release_path(&csum_path);
-out:
return ret;
}
@@ -2129,18 +2335,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
- /* Canceled? */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
+ ret = should_cancel_scrub(sctx);
+ if (ret < 0)
break;
- }
- /* Paused? */
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* Push queued extents */
+
+ if (atomic_read(&fs_info->scrub_pause_req))
scrub_blocked_if_needed(fs_info);
- }
- /* Block group removed? */
+
spin_lock(&bg->lock);
if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
spin_unlock(&bg->lock);
@@ -2395,8 +2596,6 @@ out:
}
if (sctx->is_dev_replace && ret >= 0) {
- int ret2;
-
ret2 = sync_write_pointer_for_zoned(sctx,
chunk_logical + offset,
map->stripes[stripe_index].physical,
@@ -2471,7 +2670,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
@@ -2489,12 +2688,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
return -ENOMEM;
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = scrub_dev->devid;
- key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0ull;
while (1) {
u64 dev_extent_len;
@@ -2673,14 +2872,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
+ "scrub: skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
+ btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
+ ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
@@ -2743,8 +2942,8 @@ skip_unfreeze:
btrfs_put_block_group(cache);
if (ret)
break;
- if (sctx->is_dev_replace &&
- atomic64_read(&dev_replace->num_write_errors) > 0) {
+ if (unlikely(sctx->is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0)) {
ret = -EIO;
break;
}
@@ -2757,8 +2956,6 @@ skip:
btrfs_release_path(path);
}
- btrfs_free_path(path);
-
return ret;
}
@@ -2766,29 +2963,23 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
struct page *page, u64 physical, u64 generation)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio_vec bvec;
- struct bio bio;
struct btrfs_super_block *sb = page_address(page);
int ret;
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
- ret = submit_bio_wait(&bio);
- bio_uninit(&bio);
-
+ ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
+ BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_err_rl(fs_info,
- "super block at physical %llu devid %llu has bad csum",
+ "scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
- if (btrfs_super_generation(sb) != generation) {
+ if (unlikely(btrfs_super_generation(sb) != generation)) {
btrfs_err_rl(fs_info,
-"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
btrfs_super_generation(sb), generation);
return -EUCLEAN;
@@ -2904,7 +3095,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace)
+ bool readonly, bool is_dev_replace)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
@@ -2913,6 +3104,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
unsigned int nofs_flag;
bool need_commit = false;
+ /* Set the basic fallback @last_physical before we got a sctx. */
+ if (progress)
+ progress->last_physical = start;
+
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -2931,6 +3126,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ sctx->stat.last_physical = start;
ret = scrub_workers_get(fs_info);
if (ret)
@@ -2948,16 +3144,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info,
- "scrub on devid %llu: filesystem on %s is not writable",
+ btrfs_err(fs_info,
+ "scrub: devid %llu: filesystem on %s is not writable",
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
}
mutex_lock(&fs_info->scrub_lock);
- if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
+ if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index f0df597b75c7..aa68b6ebaf55 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -11,7 +11,7 @@ struct btrfs_scrub_progress;
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
+ bool readonly, bool is_dev_replace);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7254279c3cc9..2522faa97478 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4,6 +4,7 @@
*/
#include <linux/bsearch.h>
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
@@ -16,7 +17,6 @@
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
-
#include "send.h"
#include "ctree.h"
#include "backref.h"
@@ -47,28 +47,30 @@
* It allows fast adding of path elements on the right side (normal path) and
* fast adding to the left side (reversed path). A reversed path can also be
* unreversed if needed.
+ *
+ * The definition of struct fs_path relies on -fms-extensions to allow
+ * including a tagged struct as an anonymous member.
*/
+struct __fs_path {
+ char *start;
+ char *end;
+
+ char *buf;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
+};
+static_assert(sizeof(struct __fs_path) < 256);
struct fs_path {
- union {
- struct {
- char *start;
- char *end;
-
- char *buf;
- unsigned short buf_len:15;
- unsigned short reversed:1;
- char inline_buf[];
- };
- /*
- * Average path length does not exceed 200 bytes, we'll have
- * better packing in the slab and higher chance to satisfy
- * an allocation later during send.
- */
- char pad[256];
- };
+ struct __fs_path;
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * an allocation later during send.
+ */
+ char inline_buf[256 - sizeof(struct __fs_path)];
};
#define FS_PATH_INLINE_SIZE \
- (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+ sizeof_field(struct fs_path, inline_buf)
/* reused for each extent */
@@ -304,6 +306,8 @@ struct send_ctx {
struct btrfs_lru_cache dir_created_cache;
struct btrfs_lru_cache dir_utimes_cache;
+
+ struct fs_path cur_inode_path;
};
struct pending_dir_move {
@@ -383,11 +387,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
result_string = "updated";
break;
case BTRFS_COMPARE_TREE_SAME:
- ASSERT(0);
+ DEBUG_WARN("no change between trees");
result_string = "unchanged";
break;
default:
- ASSERT(0);
+ DEBUG_WARN("unexpected comparison result %d", result);
result_string = "unexpected";
}
@@ -425,15 +429,21 @@ static int need_send_hole(struct send_ctx *sctx)
static void fs_path_reset(struct fs_path *p)
{
- if (p->reversed) {
+ if (p->reversed)
p->start = p->buf + p->buf_len - 1;
- p->end = p->start;
- *p->start = 0;
- } else {
+ else
p->start = p->buf;
- p->end = p->start;
- *p->start = 0;
- }
+
+ p->end = p->start;
+ *p->start = 0;
+}
+
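+/* Initialize an fs_path to use its inline buffer. */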
+static void init_path(struct fs_path *p)
+{
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
}
static struct fs_path *fs_path_alloc(void)
@@ -443,10 +453,7 @@ static struct fs_path *fs_path_alloc(void)
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
- p->reversed = 0;
- p->buf = p->inline_buf;
- p->buf_len = FS_PATH_INLINE_SIZE;
- fs_path_reset(p);
+ init_path(p);
return p;
}
@@ -471,7 +478,7 @@ static void fs_path_free(struct fs_path *p)
kfree(p);
}
-static int fs_path_len(struct fs_path *p)
+static inline int fs_path_len(const struct fs_path *p)
{
return p->end - p->start;
}
@@ -487,12 +494,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
- path_len = p->end - p->start;
+ path_len = fs_path_len(p);
old_buf_len = p->buf_len;
/*
@@ -533,12 +538,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
int ret;
int new_len;
- new_len = p->end - p->start + name_len;
+ new_len = fs_path_len(p) + name_len;
if (p->start != p->end)
new_len++;
ret = fs_path_ensure_buf(p, new_len);
if (ret < 0)
- goto out;
+ return ret;
if (p->reversed) {
if (p->start != p->end)
@@ -553,8 +558,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
*p->end = 0;
}
-out:
- return ret;
+ return 0;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
@@ -564,25 +568,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len)
ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
memcpy(prepared, name, name_len);
-out:
- return ret;
+ return 0;
}
-static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2)
{
- int ret;
- char *prepared;
-
- ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
- if (ret < 0)
- goto out;
- memcpy(prepared, p2->start, p2->end - p2->start);
-
-out:
- return ret;
+ return fs_path_add(p, p2->start, fs_path_len(p2));
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
@@ -594,12 +588,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(eb, prepared, off, len);
-out:
- return ret;
+ return 0;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
@@ -619,13 +612,21 @@ static void fs_path_unreverse(struct fs_path *p)
return;
tmp = p->start;
- len = p->end - p->start;
+ len = fs_path_len(p);
p->start = p->buf;
p->end = p->start + len;
memmove(p->start, tmp, len + 1);
p->reversed = 0;
}
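+/*
+ * Check if @path starts with the cached path of the inode currently being
+ * processed.
+ */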
+static inline bool is_current_inode_path(const struct send_ctx *sctx,
+ const struct fs_path *path)
+{
+ const struct fs_path *cur = &sctx->cur_inode_path;
+
+ return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0);
+}
+
static struct btrfs_path *alloc_path_for_send(void)
{
struct btrfs_path *path;
@@ -633,9 +634,9 @@ static struct btrfs_path *alloc_path_for_send(void)
path = btrfs_alloc_path();
if (!path)
return NULL;
- path->search_commit_root = 1;
- path->skip_locking = 1;
- path->need_commit_sem = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ path->need_commit_sem = true;
return path;
}
@@ -648,7 +649,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
ret = kernel_write(filp, buf + pos, len - pos, off);
if (ret < 0)
return ret;
- if (ret == 0)
+ if (unlikely(ret == 0))
return -EIO;
pos += ret;
}
@@ -740,7 +741,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
#define TLV_PUT_PATH(sctx, attrtype, p) \
do { \
ret = tlv_put_string(sctx, attrtype, p->start, \
- p->end - p->start); \
+ fs_path_len((p))); \
if (ret < 0) \
goto tlv_put_failure; \
} while(0)
@@ -761,7 +762,7 @@ static int send_header(struct send_ctx *sctx)
{
struct btrfs_stream_header hdr;
- strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
@@ -819,14 +820,11 @@ static int send_cmd(struct send_ctx *sctx)
static int send_rename(struct send_ctx *sctx,
struct fs_path *from, struct fs_path *to)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
@@ -834,7 +832,6 @@ static int send_rename(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -844,14 +841,11 @@ out:
static int send_link(struct send_ctx *sctx,
struct fs_path *path, struct fs_path *lnk)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
@@ -859,7 +853,6 @@ static int send_link(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -868,21 +861,17 @@ out:
*/
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_unlink %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -891,21 +880,17 @@ out:
*/
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rmdir %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -927,7 +912,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
struct btrfs_inode_info *info)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *ii;
struct btrfs_key key;
@@ -942,11 +927,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto out;
+ return ret;
}
if (!info)
- goto out;
+ return 0;
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -963,9 +948,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
*/
info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
@@ -991,13 +974,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *found_key, int resolve,
+ struct btrfs_key *found_key, bool resolve,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
- struct btrfs_path *tmp_path;
+ BTRFS_PATH_AUTO_FREE(tmp_path);
struct fs_path *p;
u32 cur = 0;
u32 total;
@@ -1071,10 +1054,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
if (unlikely(start < p->buf)) {
btrfs_err(root->fs_info,
- "send: path ref buffer underflow for key (%llu %u %llu)",
- found_key->objectid,
- found_key->type,
- found_key->offset);
+ "send: path ref buffer underflow for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(found_key));
ret = -EINVAL;
goto out;
}
@@ -1094,7 +1075,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- btrfs_free_path(tmp_path);
fs_path_free(p);
return ret;
}
@@ -1155,12 +1135,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
- if (name_len > XATTR_NAME_MAX) {
+ if (unlikely(name_len > XATTR_NAME_MAX)) {
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len >
- BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+ if (unlikely(name_len + data_len >
+ BTRFS_MAX_XATTR_SIZE(root->fs_info))) {
ret = -E2BIG;
goto out;
}
@@ -1168,7 +1148,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Path too long
*/
- if (name_len + data_len > PATH_MAX) {
+ if (unlikely(name_len + data_len > PATH_MAX)) {
ret = -ENAMETOOLONG;
goto out;
}
@@ -1242,7 +1222,7 @@ static int get_inode_path(struct btrfs_root *root,
{
int ret;
struct btrfs_key key, found_key;
- struct btrfs_path *p;
+ BTRFS_PATH_AUTO_FREE(p);
p = alloc_path_for_send();
if (!p)
@@ -1256,28 +1236,20 @@ static int get_inode_path(struct btrfs_root *root,
ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 1;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 1;
+
btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
if (found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
- ret = iterate_inode_ref(root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path);
if (ret < 0)
- goto out;
- ret = 0;
-
-out:
- btrfs_free_path(p);
- return ret;
+ return ret;
+ return 0;
}
struct backref_ctx {
@@ -1407,7 +1379,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
- const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ const u64 key = leaf_bytenr >> fs_info->nodesize_bits;
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
@@ -1462,7 +1434,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits;
new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
@@ -1580,7 +1552,6 @@ static int find_extent_clone(struct send_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
int extent_type;
- u64 logical;
u64 disk_byte;
u64 num_bytes;
struct btrfs_file_extent_item *fi;
@@ -1611,7 +1582,6 @@ static int find_extent_clone(struct send_ctx *sctx,
compressed = btrfs_file_extent_compression(eb, fi);
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- logical = disk_byte + btrfs_file_extent_offset(eb, fi);
/*
* Setup the clone roots.
@@ -1693,14 +1663,8 @@ static int find_extent_clone(struct send_ctx *sctx,
}
up_read(&fs_info->commit_root_sem);
- btrfs_debug(fs_info,
- "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
- data_offset, ino, num_bytes, logical);
-
- if (!backref_ctx.found) {
- btrfs_debug(fs_info, "no clones found");
+ if (!backref_ctx.found)
return -ENOENT;
- }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -1742,7 +1706,7 @@ static int read_symlink(struct btrfs_root *root,
struct fs_path *dest)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_file_extent_item *ei;
u8 type;
@@ -1759,21 +1723,20 @@ static int read_symlink(struct btrfs_root *root,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
+ return ret;
+ if (unlikely(ret)) {
/*
* An empty symlink inode. Can happen in rare error paths when
* creating a symlink (transaction committed before the inode
* eviction handler removed the symlink inode items and a crash
- * happened in between or the subvol was snapshoted in between).
+ * happened in between or the subvol was snapshotted in between).
* Print an informative message to dmesg/syslog so that the user
* can delete the symlink.
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
ino, btrfs_root_id(root));
- ret = -EIO;
- goto out;
+ return -EIO;
}
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1784,7 +1747,7 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
ino, btrfs_root_id(root), type);
- goto out;
+ return ret;
}
compression = btrfs_file_extent_compression(path->nodes[0], ei);
if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
@@ -1792,17 +1755,13 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent with compression, ino %llu root %llu compression type %d",
ino, btrfs_root_id(root), compression);
- goto out;
+ return ret;
}
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
}
/*
@@ -1813,8 +1772,7 @@ static int gen_unique_name(struct send_ctx *sctx,
u64 ino, u64 gen,
struct fs_path *dest)
{
- int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
char tmp[64];
int len;
@@ -1831,16 +1789,15 @@ static int gen_unique_name(struct send_ctx *sctx,
ino, gen, idx);
ASSERT(len < sizeof(tmp));
tmp_name.name = tmp;
- tmp_name.len = strlen(tmp);
+ tmp_name.len = len;
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1849,7 +1806,6 @@ static int gen_unique_name(struct send_ctx *sctx,
if (!sctx->parent_root) {
/* unique */
- ret = 0;
break;
}
@@ -1857,10 +1813,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1870,11 +1825,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, strlen(tmp));
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add(dest, tmp, len);
}
enum inode_state {
@@ -1897,7 +1848,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
if (send_gen)
@@ -1908,7 +1859,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
} else {
ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
if (parent_gen)
@@ -1953,7 +1904,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = -ENOENT;
}
-out:
return ret;
}
@@ -1967,17 +1917,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
- goto out;
+ return ret;
if (ret == inode_state_no_change ||
ret == inode_state_did_create ||
ret == inode_state_will_delete)
- ret = 1;
- else
- ret = 0;
+ return 1;
-out:
- return ret;
+ return 0;
}
/*
@@ -1990,7 +1937,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
int ret = 0;
struct btrfs_dir_item *di;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
@@ -1998,19 +1945,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
- if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto out;
- }
+ if (IS_ERR_OR_NULL(di))
+ return di ? PTR_ERR(di) : -ENOENT;
+
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.type == BTRFS_ROOT_ITEM_KEY) {
- ret = -ENOENT;
- goto out;
- }
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ return -ENOENT;
+
*found_inode = key.objectid;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2024,7 +1967,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int len;
u64 parent_dir;
@@ -2038,16 +1981,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (!ret)
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (ret || found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -2068,19 +2009,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
}
if (ret < 0)
- goto out;
+ return ret;
btrfs_release_path(path);
if (dir_gen) {
ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
- goto out;
+ return ret;
}
*dir = parent_dir;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2326,9 +2265,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
if (ret < 0)
- goto out;
- ret = nce->ret;
- goto out;
+ return ret;
+ return nce->ret;
}
}
@@ -2339,12 +2277,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*/
ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
goto out_cache;
}
@@ -2360,21 +2298,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
ret = get_first_ref(sctx->parent_root, ino,
parent_ino, parent_gen, dest);
if (ret < 0)
- goto out;
+ return ret;
/*
* Check if the ref was overwritten by an inode's ref that was processed
* earlier. If yes, treat as orphan and return 1.
*/
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
- dest->start, dest->end - dest->start);
+ dest->start, fs_path_len(dest));
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
fs_path_reset(dest);
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
}
@@ -2383,10 +2321,8 @@ out_cache:
* Store the result of the lookup in the name cache.
*/
nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
- if (!nce) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!nce)
+ return -ENOMEM;
nce->entry.key = ino;
nce->entry.gen = gen;
@@ -2404,10 +2340,9 @@ out_cache:
nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
if (nce_ret < 0) {
kfree(nce);
- ret = nce_ret;
+ return nce_ret;
}
-out:
return ret;
}
@@ -2444,6 +2379,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
+ const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
+
+ if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
+ if (dest != &sctx->cur_inode_path)
+ return fs_path_copy(dest, &sctx->cur_inode_path);
+
+ return 0;
+ }
name = fs_path_alloc();
if (!name) {
@@ -2495,8 +2438,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
out:
fs_path_free(name);
- if (!ret)
+ if (!ret) {
fs_path_unreverse(dest);
+ if (is_cur_inode && dest != &sctx->cur_inode_path)
+ ret = fs_path_copy(&sctx->cur_inode_path, dest);
+ }
+
return ret;
}
@@ -2508,11 +2455,11 @@ static int send_subvol_begin(struct send_ctx *sctx)
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_root *parent_root = sctx->parent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
- char *name = NULL;
+ char AUTO_KFREE(name);
int namelen;
path = btrfs_alloc_path();
@@ -2520,10 +2467,8 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
- if (!name) {
- btrfs_free_path(path);
+ if (!name)
return -ENOMEM;
- }
key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
@@ -2532,18 +2477,15 @@ static int send_subvol_begin(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
&key, path, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret)
+ return -ENOENT;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
key.objectid != btrfs_root_id(send_root)) {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
namelen = btrfs_root_ref_name_len(leaf, ref);
@@ -2553,11 +2495,11 @@ static int send_subvol_begin(struct send_ctx *sctx)
if (parent_root) {
ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
if (ret < 0)
- goto out;
+ return ret;
} else {
ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
if (ret < 0)
- goto out;
+ return ret;
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
@@ -2585,31 +2527,63 @@ static int send_subvol_begin(struct send_ctx *sctx)
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- btrfs_free_path(path);
- kfree(name);
return ret;
}
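+/*
+ * Return the cached path of the inode currently being processed, resolving
+ * and caching it on first use.
+ */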
+static struct fs_path *get_cur_inode_path(struct send_ctx *sctx)
+{
+ if (fs_path_len(&sctx->cur_inode_path) == 0) {
+ int ret;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ &sctx->cur_inode_path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ return &sctx->cur_inode_path;
+}
+
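+/*
+ * Get the path to use for a send command. If @ino/@gen refer to the inode
+ * currently being processed, return its cached path, otherwise allocate and
+ * resolve a new one. The result must be released with free_path_for_command().
+ */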
+static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ struct fs_path *path;
+ int ret;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ return get_cur_inode_path(sctx);
+
+ path = fs_path_alloc();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = get_cur_path(sctx, ino, gen, path);
+ if (ret < 0) {
+ fs_path_free(path);
+ return ERR_PTR(ret);
+ }
+
+ return path;
+}
+
+static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path)
+{
+ if (path != &sctx->cur_inode_path)
+ fs_path_free(path);
+}
+
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
@@ -2617,29 +2591,23 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
@@ -2647,32 +2615,26 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
if (sctx->proto < 2)
return 0;
- btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
@@ -2680,30 +2642,23 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
- ino, uid, gid);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
@@ -2712,26 +2667,23 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p = NULL;
struct btrfs_inode_item *ii;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
struct btrfs_key key;
int slot;
- btrfs_debug(fs_info, "send_utimes %llu", ino);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
path = alloc_path_for_send();
if (!path) {
@@ -2756,9 +2708,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
@@ -2770,8 +2719,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
- fs_path_free(p);
- btrfs_free_path(path);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2781,7 +2729,7 @@ out:
* processing an inode that is a directory and it just got renamed, and existing
* entries in the cache may refer to inodes that have the directory in their
* full path - in which case we would generate outdated paths (pre-rename)
- * for the inodes that the cache entries point to. Instead of prunning the
+ * for the inodes that the cache entries point to. Instead of pruning the
* cache when inserting, do it after we finish processing each inode at
* finish_inode_if_needed().
*/
@@ -2838,7 +2786,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx)
*/
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
int cmd;
@@ -2847,8 +2794,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
u64 mode;
u64 rdev;
- btrfs_debug(fs_info, "send_create_inode %llu", ino);
-
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2945,7 +2890,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
@@ -2985,7 +2930,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -3075,7 +3019,7 @@ static void __free_recorded_refs(struct list_head *head)
struct recorded_ref *cur;
while (!list_empty(head)) {
- cur = list_entry(head->next, struct recorded_ref, list);
+ cur = list_first_entry(head, struct recorded_ref, list);
recorded_ref_free(cur);
}
}
@@ -3106,6 +3050,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
ret = send_rename(sctx, path, orphan);
+ if (ret < 0)
+ goto out;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ ret = fs_path_copy(&sctx->cur_inode_path, orphan);
out:
fs_path_free(orphan);
@@ -3760,7 +3709,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key di_key;
struct btrfs_dir_item *di;
@@ -3781,19 +3730,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
- if (!di) {
- ret = 0;
- goto out;
- }
+ if (!di)
+ return 0;
/*
* di_key.objectid has the number of the inode that has a dentry in the
* parent directory with the same name that sctx->cur_ino is being
@@ -3803,26 +3748,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
* that it happens after that other inode is renamed.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
- if (di_key.type != BTRFS_INODE_ITEM_KEY) {
- ret = 0;
- goto out;
- }
+ if (di_key.type != BTRFS_INODE_ITEM_KEY)
+ return 0;
ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
- goto out;
+ return ret;
ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
- goto out;
+ return ret;
}
/* Different inode, no need to delay the rename of sctx->cur_ino */
- if (right_gen != left_gen) {
- ret = 0;
- goto out;
- }
+ if (right_gen != left_gen)
+ return 0;
wdm = get_waiting_dir_move(sctx, di_key.objectid);
if (wdm && !wdm->orphanized) {
@@ -3836,8 +3777,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
if (!ret)
ret = 1;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3887,7 +3826,7 @@ static int is_ancestor(struct btrfs_root *root,
bool free_fs_path = false;
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
if (!fs_path) {
@@ -3955,7 +3894,6 @@ static int is_ancestor(struct btrfs_root *root,
ret = iter_ret;
out:
- btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
return ret;
@@ -4135,7 +4073,7 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
*/
static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
{
- char *name;
+ char AUTO_KFREE(name);
int ret;
name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
@@ -4145,17 +4083,75 @@ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
fs_path_reset(ref->full_path);
ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
if (ret < 0)
- goto out;
+ return ret;
ret = fs_path_add(ref->full_path, name, ref->name_len);
if (ret < 0)
- goto out;
+ return ret;
/* Update the reference's base name pointer. */
set_ref_path(ref, ref->full_path);
-out:
- kfree(name);
- return ret;
+
+ return 0;
+}
+
+static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_check_dir_ref_comp(entry, parent) < 0;
+}
+
+static int record_check_dir_ref_in_tree(struct rb_root *root,
+ struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *tmp_ref;
+ int ret;
+
+ if (rb_find(ref, root, rbtree_check_dir_ref_comp))
+ return 0;
+
+ ret = dup_ref(ref, list);
+ if (ret < 0)
+ return ret;
+
+ tmp_ref = list_last_entry(list, struct recorded_ref, list);
+ rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less);
+ tmp_ref->root = root;
+ return 0;
+}
+
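/*
 * Editor's illustration (not part of the patch): record_check_dir_ref_in_tree()
 * above queues each (dir, dir_gen) pair for the later check_dirs pass only
 * once, by probing an rbtree keyed with rbtree_check_dir_ref_comp(). A
 * userspace analog of the same dedup-by-comparator idea using glibc
 * tsearch(3); the dup_ref() equivalent is elided.
 */
#include <stdlib.h>
#include <search.h>
#include <stdint.h>

struct dir_key { uint64_t dir; uint64_t dir_gen; };

static int dir_key_cmp(const void *a, const void *b)
{
	const struct dir_key *x = a, *y = b;

	if (x->dir != y->dir)
		return x->dir < y->dir ? -1 : 1;
	if (x->dir_gen != y->dir_gen)
		return x->dir_gen < y->dir_gen ? -1 : 1;
	return 0;
}

static int check_dir_once(void **tree, uint64_t dir, uint64_t dir_gen)
{
	struct dir_key *key = malloc(sizeof(*key));
	struct dir_key **slot;

	if (!key)
		return -1;
	key->dir = dir;
	key->dir_gen = dir_gen;
	slot = (struct dir_key **)tsearch(key, tree, dir_key_cmp);
	if (!slot || *slot != key) {
		free(key);		/* already queued (or OOM): nothing more to do */
		return slot ? 0 : -1;
	}
	/* first occurrence: the equivalent of dup_ref() would run here */
	return 0;
}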
+static int rename_current_inode(struct send_ctx *sctx,
+ struct fs_path *current_path,
+ struct fs_path *new_path)
+{
+ int ret;
+
+ ret = send_rename(sctx, current_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ ret = fs_path_copy(&sctx->cur_inode_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ return fs_path_copy(current_path, new_path);
}
/*
@@ -4168,19 +4164,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct recorded_ref *cur;
struct recorded_ref *cur2;
LIST_HEAD(check_dirs);
+ struct rb_root rbtree_check_dirs = RB_ROOT;
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- int did_overwrite = 0;
- int is_orphan = 0;
- u64 last_dir_ino_rm = 0;
+ bool did_overwrite = false;
+ bool is_orphan = false;
bool can_rename = true;
bool orphanized_dir = false;
bool orphanized_ancestor = false;
- btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
-
/*
* This should never happen as the root dir always has the same ref
* which is always '..'
@@ -4216,14 +4210,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (ret)
- did_overwrite = 1;
+ did_overwrite = true;
}
if (sctx->cur_inode_new || did_overwrite) {
ret = gen_unique_name(sctx, sctx->cur_ino,
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
} else {
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
valid_path);
@@ -4348,6 +4342,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret > 0) {
orphanized_ancestor = true;
fs_path_reset(valid_path);
+ fs_path_reset(&sctx->cur_inode_path);
ret = get_cur_path(sctx, sctx->cur_ino,
sctx->cur_inode_gen,
valid_path);
@@ -4443,13 +4438,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* it depending on the inode mode.
*/
if (is_orphan && can_rename) {
- ret = send_rename(sctx, valid_path, cur->full_path);
- if (ret < 0)
- goto out;
- is_orphan = 0;
- ret = fs_path_copy(valid_path, cur->full_path);
+ ret = rename_current_inode(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
+ is_orphan = false;
} else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
@@ -4457,10 +4449,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
+ ret = rename_current_inode(sctx, valid_path,
cur->full_path);
if (ret < 0)
goto out;
@@ -4483,7 +4472,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4507,11 +4496,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4520,9 +4509,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
/*
* We have a moved dir. Add the old parent to check_dirs
*/
- cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
- list);
- ret = dup_ref(cur, &check_dirs);
+ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -4553,8 +4541,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
+ if (is_current_inode_path(sctx, cur->full_path))
+ fs_path_reset(&sctx->cur_inode_path);
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4597,8 +4587,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete &&
- cur->dir != last_dir_ino_rm) {
+ } else if (ret == inode_state_did_delete) {
ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -4610,7 +4599,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
- last_dir_ino_rm = cur->dir;
}
}
}
@@ -4628,7 +4616,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
const struct recorded_ref *data = k;
const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
- int result;
if (data->dir > ref->dir)
return 1;
@@ -4642,12 +4629,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
return 1;
if (data->name_len < ref->name_len)
return -1;
- result = strcmp(data->name, ref->name);
- if (result > 0)
- return 1;
- if (result < 0)
- return -1;
- return 0;
+ return strcmp(data->name, ref->name);
}
static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
@@ -4701,7 +4683,7 @@ out:
static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4710,7 +4692,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4724,13 +4706,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
&sctx->new_refs, name, dir, dir_gen,
sctx);
}
-out:
+
return ret;
}
static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4739,7 +4721,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4753,7 +4735,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
&sctx->deleted_refs, name, dir,
dir_gen, sctx);
}
-out:
+
return ret;
}
@@ -4761,47 +4743,40 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed,
- sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_changed_ref(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
+ return ret;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
/*
@@ -4814,7 +4789,7 @@ static int process_all_refs(struct send_ctx *sctx,
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
iterate_inode_ref_t cb;
@@ -4833,8 +4808,7 @@ static int process_all_refs(struct send_ctx *sctx,
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
key.objectid = sctx->cmp_key->objectid;
@@ -4846,15 +4820,14 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx);
if (ret < 0)
- goto out;
+ return ret;
}
/* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
+ if (iter_ret < 0)
+ return iter_ret;
+
btrfs_release_path(path);
/*
@@ -4862,22 +4835,23 @@ static int process_all_refs(struct send_ctx *sctx,
* re-creating this inode and will be rename'ing it into place once we
* rename the parent directory.
*/
- ret = process_recorded_refs(sctx, &pending_move);
-out:
- btrfs_free_path(path);
- return ret;
+ return process_recorded_refs(sctx, &pending_move);
}
static int send_set_xattr(struct send_ctx *sctx,
- struct fs_path *path,
const char *name, int name_len,
const char *data, int data_len)
{
- int ret = 0;
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4886,7 +4860,6 @@ static int send_set_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4894,11 +4867,11 @@ static int send_remove_xattr(struct send_ctx *sctx,
struct fs_path *path,
const char *name, int name_len)
{
- int ret = 0;
+ int ret;
ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4906,7 +4879,6 @@ static int send_remove_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4914,19 +4886,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len, const char *data,
int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
/* Capabilities are emitted by finish_inode_if_needed */
if (!strncmp(name, XATTR_NAME_CAPS, name_len))
return 0;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
/*
* This hack is needed because empty acls are stored as zero byte
* data in xattrs. Problem with that is, that receiving these zero byte
@@ -4943,48 +4909,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
}
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_set_xattr(sctx, name, name_len, data, data_len);
}
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_remove_xattr(sctx, p, name, name_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_remove_xattr(sctx, p, name, name_len);
}
static int process_new_xattr(struct send_ctx *sctx)
{
- int ret = 0;
-
- ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- __process_new_xattr, sctx);
-
- return ret;
+ return iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_new_xattr, sctx);
}
static int process_deleted_xattr(struct send_ctx *sctx)
@@ -4999,6 +4944,7 @@ struct find_xattr_ctx {
int found_idx;
char *found_data;
int found_data_len;
+ bool copy_data;
};
static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
@@ -5010,9 +4956,11 @@ static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
- if (!ctx->found_data)
- return -ENOMEM;
+ if (ctx->copy_data) {
+ ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
+ if (!ctx->found_data)
+ return -ENOMEM;
+ }
return 1;
}
return 0;
@@ -5032,6 +4980,7 @@ static int find_xattr(struct btrfs_root *root,
ctx.found_idx = -1;
ctx.found_data = NULL;
ctx.found_data_len = 0;
+ ctx.copy_data = (data != NULL);
ret = iterate_dir_item(root, path, __find_xattr, &ctx);
if (ret < 0)
@@ -5043,7 +4992,7 @@ static int find_xattr(struct btrfs_root *root,
*data = ctx.found_data;
*data_len = ctx.found_data_len;
} else {
- kfree(ctx.found_data);
+ ASSERT(ctx.found_data == NULL);
}
return ctx.found_idx;
}
@@ -5056,8 +5005,8 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
{
int ret;
struct send_ctx *sctx = ctx;
- char *found_data = NULL;
- int found_data_len = 0;
+ char AUTO_KFREE(found_data);
+ int found_data_len = 0;
ret = find_xattr(sctx->parent_root, sctx->right_path,
sctx->cmp_key, name, name_len, &found_data,
@@ -5075,7 +5024,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
}
}
- kfree(found_data);
return ret;
}
@@ -5100,17 +5048,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
static int process_changed_xattr(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
__process_changed_new_xattr, sctx);
if (ret < 0)
- goto out;
- ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
- __process_changed_deleted_xattr, sctx);
+ return ret;
-out:
- return ret;
+ return iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_changed_deleted_xattr, sctx);
}
static int process_all_new_xattrs(struct send_ctx *sctx)
@@ -5118,7 +5064,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5146,7 +5092,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -5157,7 +5102,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
@@ -5172,25 +5117,24 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *p;
inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0);
if (ret < 0)
goto iput;
- if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) {
ret = -EMSGSIZE;
goto iput;
}
@@ -5203,27 +5147,19 @@ static int process_verity(struct send_ctx *sctx)
}
}
- ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret);
if (ret < 0)
goto iput;
- p = fs_path_alloc();
- if (!p) {
- ret = -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
goto iput;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto free_path;
ret = send_verity(sctx, p, sctx->verity_descriptor);
- if (ret < 0)
- goto free_path;
-
-free_path:
- fs_path_free(p);
iput:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5242,14 +5178,14 @@ static int put_data_header(struct send_ctx *sctx, u32 len)
* Since v2, the data attribute header doesn't include a length,
* it is implicitly to the end of the command.
*/
- if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len))
return -EOVERFLOW;
put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
sctx->send_size += sizeof(__le16);
} else {
struct btrfs_tlv_header *hdr;
- if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len))
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
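/*
 * Editor's illustration (not part of the patch): as the comment above says, a
 * v1 data attribute carries an explicit TLV header (type + length) while a v2
 * data attribute carries only the type and runs to the end of the command.
 * Minimal userspace sketch; host endianness is used for brevity (the real
 * stream is little-endian) and the header layout is assumed from this hunk.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct tlv_header { uint16_t tlv_type; uint16_t tlv_len; } __attribute__((packed));

static size_t put_data_v1(uint8_t *buf, uint16_t attr, const void *data, uint16_t len)
{
	struct tlv_header hdr = { .tlv_type = attr, .tlv_len = len };

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), data, len);
	return sizeof(hdr) + len;		/* length is explicit in the header */
}

static size_t put_data_v2(uint8_t *buf, uint16_t attr, const void *data, size_t len)
{
	memcpy(buf, &attr, sizeof(attr));	/* type only, no length field */
	memcpy(buf + sizeof(attr), data, len);
	return sizeof(attr) + len;		/* length implied by the command size */
}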
@@ -5263,10 +5199,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct folio *folio;
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t last_index;
- unsigned pg_offset = offset_in_page(offset);
+ u64 cur = offset;
+ const u64 end = offset + len;
+ const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT);
struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
@@ -5274,11 +5209,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- last_index = (offset + len - 1) >> PAGE_SHIFT;
-
- while (index <= last_index) {
- unsigned cur_len = min_t(unsigned, len,
- PAGE_SIZE - pg_offset);
+ while (cur < end) {
+ pgoff_t index = (cur >> PAGE_SHIFT);
+ unsigned int cur_len;
+ unsigned int pg_offset;
+ struct folio *folio;
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
@@ -5292,8 +5227,8 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
break;
}
}
-
- WARN_ON(folio_order(folio));
+ pg_offset = offset_in_folio(folio, cur);
+ cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset);
if (folio_test_readahead(folio))
page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
@@ -5302,7 +5237,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
@@ -5312,15 +5247,18 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
ret = -EIO;
break;
}
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
}
memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
pg_offset, cur_len);
folio_unlock(folio);
folio_put(folio);
- index++;
- pg_offset = 0;
- len -= cur_len;
+ cur += cur_len;
sctx->send_size += cur_len;
}
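/*
 * Editor's illustration (not part of the patch): the rewritten loop above
 * computes, per iteration, the offset inside the current folio and how many
 * bytes fit before either the folio or the requested range ends, so it no
 * longer assumes order-0 folios. The same arithmetic over variable-sized,
 * naturally aligned blocks; block_size_of() is a hypothetical stand-in for
 * folio_size().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t block_size_of(uint64_t pos)
{
	return (pos & (1 << 16)) ? 65536 : 4096;	/* arbitrary mix of block sizes */
}

static void walk_range(uint64_t offset, uint64_t len)
{
	const uint64_t end = offset + len;
	uint64_t cur = offset;

	while (cur < end) {
		uint64_t bs = block_size_of(cur);
		uint64_t off_in_block = cur & (bs - 1);	/* blocks are naturally aligned */
		uint64_t cur_len = end - cur < bs - off_in_block ?
				   end - cur : bs - off_in_block;

		printf("copy %llu bytes at offset %llu (block offset %llu)\n",
		       (unsigned long long)cur_len, (unsigned long long)cur,
		       (unsigned long long)off_in_block);
		cur += cur_len;
	}
}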
@@ -5333,35 +5271,26 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
*/
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
ret = put_file_data(sctx, offset, len);
if (ret < 0)
- goto out;
+ return ret;
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5374,12 +5303,12 @@ static int send_clone(struct send_ctx *sctx,
{
int ret = 0;
struct fs_path *p;
+ struct fs_path *cur_inode_path;
u64 gen;
- btrfs_debug(sctx->send_root->fs_info,
- "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, btrfs_root_id(clone_root->root),
- clone_root->ino, clone_root->offset);
+ cur_inode_path = get_cur_inode_path(sctx);
+ if (IS_ERR(cur_inode_path))
+ return PTR_ERR(cur_inode_path);
p = fs_path_alloc();
if (!p)
@@ -5389,13 +5318,9 @@ static int send_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);
if (clone_root->root == sctx->send_root) {
ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
@@ -5446,27 +5371,45 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
if (ret < 0)
- goto out;
+ return ret;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+ return ret;
+}
+
+static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
+{
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE);
if (ret < 0)
- goto out;
+ return ret;
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5478,6 +5421,14 @@ static int send_hole(struct send_ctx *sctx, u64 end)
int ret = 0;
/*
+ * Starting with send stream v2 we have fallocate and can use it to
+ * punch holes instead of sending writes full of zeroes.
+ */
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE))
+ return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - offset);
+
+ /*
* A hole that starts at EOF or beyond it. Since we do not yet support
* fallocate (for extent preallocation and hole punching), sending a
* write of zeroes starting at EOF or beyond would later require issuing
@@ -5495,12 +5446,10 @@ static int send_hole(struct send_ctx *sctx, u64 end)
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto tlv_put_failure;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
while (offset < end) {
u64 len = min(end - offset, read_size);
@@ -5521,7 +5470,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
}
sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
- fs_path_free(p);
return ret;
}
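/*
 * Editor's illustration (not part of the patch): with stream v2 a hole is
 * expressed as a fallocate command instead of zero-filled writes. On a
 * receiver this boils down to the plain fallocate(2) call below;
 * FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <errno.h>

static int punch_hole(int fd, off_t offset, off_t len)
{
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len) == -1)
		return -errno;	/* e.g. -EOPNOTSUPP if the filesystem cannot punch holes */
	return 0;
}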
@@ -5529,9 +5477,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
struct btrfs_path *path, u64 offset,
u64 len)
{
- struct btrfs_root *root = sctx->send_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5540,23 +5486,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
- goto out;
- }
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -5572,12 +5508,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, ei));
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
ret = put_data_header(sctx, inline_size);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
btrfs_file_extent_inline_start(ei), inline_size);
sctx->send_size += inline_size;
@@ -5585,9 +5521,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(fspath);
- iput(inode);
return ret;
}
@@ -5596,7 +5529,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5611,9 +5544,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (IS_ERR(inode))
return PTR_ERR(inode);
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath)) {
+ ret = PTR_ERR(fspath);
goto out;
}
@@ -5621,10 +5554,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
@@ -5656,8 +5585,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* between the beginning of the command and the file data.
*/
data_offset = PAGE_ALIGN(sctx->send_size);
- if (data_offset > sctx->send_max_size ||
- sctx->send_max_size - data_offset < disk_num_bytes) {
+ if (unlikely(data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes)) {
ret = -EOVERFLOW;
goto out;
}
@@ -5666,7 +5595,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
+ ret = btrfs_encoded_read_regular_fill_pages(inode,
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT),
@@ -5692,8 +5621,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
tlv_put_failure:
out:
- fs_path_free(fspath);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5735,15 +5663,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
}
if (sctx->cur_inode == NULL) {
+ struct btrfs_inode *btrfs_inode;
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(sctx->cur_inode)) {
- int err = PTR_ERR(sctx->cur_inode);
+ btrfs_inode = btrfs_iget(sctx->cur_ino, root);
+ if (IS_ERR(btrfs_inode))
+ return PTR_ERR(btrfs_inode);
- sctx->cur_inode = NULL;
- return err;
- }
+ sctx->cur_inode = &btrfs_inode->vfs_inode;
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
@@ -5822,12 +5749,11 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct fs_path *fspath = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
unsigned long data_ptr;
- char *buf = NULL;
+ char AUTO_KFREE(buf);
int buf_len;
int ret = 0;
@@ -5839,35 +5765,23 @@ static int send_capabilities(struct send_ctx *sctx)
XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
if (!di) {
/* There is no xattr for this inode */
- goto out;
+ return 0;
} else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
+ return PTR_ERR(di);
}
leaf = path->nodes[0];
buf_len = btrfs_dir_data_len(leaf, di);
- fspath = fs_path_alloc();
buf = kmalloc(buf_len, GFP_KERNEL);
- if (!fspath || !buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
+ if (!buf)
+ return -ENOMEM;
data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
read_extent_buffer(leaf, buf, data_ptr, buf_len);
- ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
strlen(XATTR_NAME_CAPS), buf, buf_len);
-out:
- kfree(buf);
- fs_path_free(fspath);
- btrfs_free_path(path);
return ret;
}
@@ -5875,7 +5789,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct clone_root *clone_root, const u64 disk_byte,
u64 data_offset, u64 offset, u64 len)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
struct btrfs_inode_info info;
@@ -5911,7 +5825,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
- goto out;
+ return ret;
clone_src_i_size = info.size;
/*
@@ -5941,7 +5855,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
key.offset = clone_root->offset;
ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
if (key.objectid == clone_root->ino &&
@@ -5962,7 +5876,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
if (ret < 0)
- goto out;
+ return ret;
else if (ret > 0)
break;
continue;
@@ -5999,7 +5913,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_extent_data(sctx, dst_path, offset,
hole_len);
if (ret < 0)
- goto out;
+ return ret;
len -= hole_len;
if (len == 0)
@@ -6070,7 +5984,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, slen,
clone_root);
if (ret < 0)
- goto out;
+ return ret;
}
ret = send_extent_data(sctx, dst_path,
offset + slen,
@@ -6104,7 +6018,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
}
if (ret < 0)
- goto out;
+ return ret;
len -= clone_len;
if (len == 0)
@@ -6135,8 +6049,6 @@ next:
ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6225,7 +6137,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
{
int ret = 0;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
struct btrfs_key found_key;
@@ -6251,10 +6163,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
left_type = btrfs_file_extent_type(eb, ei);
- if (left_type != BTRFS_FILE_EXTENT_REG) {
- ret = 0;
- goto out;
- }
+ if (left_type != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
left_len = btrfs_file_extent_num_bytes(eb, ei);
left_offset = btrfs_file_extent_offset(eb, ei);
@@ -6286,11 +6197,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset = ekey->offset;
ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 0;
/*
* Handle special case where the right side has no extents at all.
@@ -6299,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ found_key.type != key.type)
/* If we're a hole then just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We're now on 2a, 2b or 7.
@@ -6313,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
if (right_type != BTRFS_FILE_EXTENT_REG &&
- right_type != BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ right_type != BTRFS_FILE_EXTENT_INLINE)
+ return 0;
if (right_type == BTRFS_FILE_EXTENT_INLINE) {
right_len = btrfs_file_extent_ram_bytes(eb, ei);
@@ -6329,11 +6234,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
*/
- if (found_key.offset + right_len <= ekey->offset) {
+ if (found_key.offset + right_len <= ekey->offset)
/* If we're a hole just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We just wanted to see if when we have an inline extent, what
@@ -6343,10 +6246,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* compressed extent representing data with a size matching
* the page size (currently the same as sector size).
*/
- if (right_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ if (right_type == BTRFS_FILE_EXTENT_INLINE)
+ return 0;
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
right_offset = btrfs_file_extent_offset(eb, ei);
@@ -6366,17 +6267,15 @@ static int is_extent_unchanged(struct send_ctx *sctx,
*/
if (left_disknr != right_disknr ||
left_offset_fixed != right_offset ||
- left_gen != right_gen) {
- ret = 0;
- goto out;
- }
+ left_gen != right_gen)
+ return 0;
/*
* Go to the next extent.
*/
ret = btrfs_next_item(sctx->parent_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -6387,10 +6286,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset += right_len;
break;
}
- if (found_key.offset != key.offset + right_len) {
- ret = 0;
- goto out;
- }
+ if (found_key.offset != key.offset + right_len)
+ return 0;
+
key = found_key;
}
@@ -6403,15 +6301,12 @@ static int is_extent_unchanged(struct send_ctx *sctx,
else
ret = 0;
-
-out:
- btrfs_free_path(path);
return ret;
}
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = sctx->send_root;
struct btrfs_key key;
int ret;
@@ -6427,15 +6322,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
key.offset = offset;
ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
- goto out;
+ return ret;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6443,7 +6336,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
const u64 start,
const u64 end)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = sctx->parent_root;
u64 search_start = start;
@@ -6458,7 +6351,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
key.offset = search_start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -6471,8 +6364,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6494,15 +6387,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
search_start = extent_end;
goto next;
}
- ret = 0;
- goto out;
+ return 0;
next:
path->slots[0]++;
}
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+ return 1;
}
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
@@ -6610,7 +6499,7 @@ static int process_all_extents(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -6637,11 +6526,10 @@ static int process_all_extents(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
int *pending_move,
int *refs_processed)
{
@@ -6664,7 +6552,7 @@ out:
return ret;
}
-static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
{
int ret = 0;
struct btrfs_inode_info info;
@@ -6892,6 +6780,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
+ fs_path_reset(&sctx->cur_inode_path);
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -7098,7 +6987,7 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "reference");
return -EIO;
}
@@ -7126,7 +7015,7 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "xattr");
return -EIO;
}
@@ -7253,7 +7142,7 @@ static int changed_cb(struct btrfs_path *left_path,
enum btrfs_compare_tree_result result,
struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
/*
* We can not hold the commit root semaphore here. This is because in
@@ -7313,7 +7202,6 @@ static int changed_cb(struct btrfs_path *left_path,
return 0;
}
result = BTRFS_COMPARE_TREE_CHANGED;
- ret = 0;
}
sctx->left_path = left_path;
@@ -7367,11 +7255,11 @@ static int search_key_again(const struct send_ctx *sctx,
*/
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
ASSERT(ret <= 0);
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_print_tree(path->nodes[path->lowest_level], false);
btrfs_err(root->fs_info,
-"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
- key->objectid, key->type, key->offset,
+"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d",
+ BTRFS_KEY_FMT_VALUE(key),
(root == sctx->parent_root ? "parent" : "send"),
btrfs_root_id(root), path->lowest_level,
path->slots[path->lowest_level]);
@@ -7387,7 +7275,7 @@ static int full_send_tree(struct send_ctx *sctx)
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_fs_info *fs_info = send_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = alloc_path_for_send();
if (!path)
@@ -7404,7 +7292,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
goto out_finish;
@@ -7414,7 +7302,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -7433,14 +7321,14 @@ static int full_send_tree(struct send_ctx *sctx)
btrfs_release_path(path);
ret = search_key_again(sctx, send_root, path, &key);
if (ret < 0)
- goto out;
+ return ret;
} else {
up_read(&fs_info->commit_root_sem);
}
ret = btrfs_next_item(send_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
ret = 0;
break;
@@ -7448,11 +7336,7 @@ static int full_send_tree(struct send_ctx *sctx)
}
out_finish:
- ret = finish_inode_if_needed(sctx, 1);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return finish_inode_if_needed(sctx, 1);
}
static int replace_node_with_clone(struct btrfs_path *path, int level)
@@ -7707,8 +7591,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
int cmp;
- struct btrfs_path *left_path = NULL;
- struct btrfs_path *right_path = NULL;
+ BTRFS_PATH_AUTO_FREE(left_path);
+ BTRFS_PATH_AUTO_FREE(right_path);
struct btrfs_key left_key;
struct btrfs_key right_key;
char *tmp_buf = NULL;
@@ -7743,10 +7627,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- left_path->search_commit_root = 1;
- left_path->skip_locking = 1;
- right_path->search_commit_root = 1;
- right_path->skip_locking = 1;
+ left_path->search_commit_root = true;
+ left_path->skip_locking = true;
+ right_path->search_commit_root = true;
+ right_path->skip_locking = true;
/*
* Strategy: Go to the first items of both trees. Then do
@@ -7981,8 +7865,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
out_unlock:
up_read(&fs_info->commit_root_sem);
out:
- btrfs_free_path(left_path);
- btrfs_free_path(right_path);
kvfree(tmp_buf);
return ret;
}
@@ -8049,7 +7931,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
}
/*
- * Make sure any existing dellaloc is flushed for any root used by a send
+ * Make sure any existing delalloc is flushed for any root used by a send
* operation so that we do not miss any data and we do not race with writeback
* finishing and changing a tree while send is using the tree. This could
* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
@@ -8102,10 +7984,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
@@ -8168,6 +8049,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
goto out;
}
+ init_path(&sctx->cur_inode_path);
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
@@ -8444,6 +8326,9 @@ out:
btrfs_lru_cache_clear(&sctx->dir_created_cache);
btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
+ if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
+ kfree(sctx->cur_inode_path.buf);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9309886c5ea1..652bb28f63d4 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct btrfs_inode;
+struct btrfs_root;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 255e85f78313..6babbe333741 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/spinlock.h"
+#include <linux/spinlock.h>
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
@@ -14,6 +14,8 @@
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
+#include "zoned.h"
+#include "delayed-inode.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -49,11 +51,11 @@
* num_bytes we want to reserve.
*
* ->reserve
- * space_info->bytes_may_reserve += num_bytes
+ * space_info->bytes_may_use += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
- * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_may_use -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
@@ -66,7 +68,7 @@
* Assume we are unable to simply make the reservation because we do not have
* enough space
*
- * -> __reserve_bytes
+ * -> reserve_bytes
* create a reserve_ticket with ->bytes set to our reservation, add it to
* the tail of space_info->tickets, kick async flush thread
*
@@ -127,6 +129,14 @@
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
+ * RESET_ZONES
+ * This state only applies to zoned mode. In zoned mode, a region that was
+ * allocated and then freed cannot be reused until its zone is reset, due to
+ * the sequential write requirement of zones. The RESET_ZONES state resets
+ * the zones of an unused block group so that the space can be reused, which
+ * is faster than removing the block group and allocating another block
+ * group on the zones.
+ *
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
@@ -163,15 +173,14 @@
* thing with or without extra unallocated space.
*/
-u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
- bool may_use_included)
-{
- ASSERT(s_info);
- return s_info->bytes_used + s_info->bytes_reserved +
- s_info->bytes_pinned + s_info->bytes_readonly +
- s_info->bytes_zone_unusable +
- (may_use_included ? s_info->bytes_may_use : 0);
-}
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ bool steal;
+ struct list_head list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
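/*
 * Editor's illustration (not part of the patch): per the flow described in
 * the comment above, a reservation is first accounted in bytes_may_use and
 * only moves to bytes_reserved once an extent is actually allocated. A
 * minimal single-threaded sketch of that accounting; flushing, tickets and
 * overcommit are deliberately ignored.
 */
#include <stdint.h>
#include <stdbool.h>

struct toy_space_info {
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_reserved;
	uint64_t bytes_may_use;
};

static uint64_t toy_space_used(const struct toy_space_info *si)
{
	return si->bytes_used + si->bytes_reserved + si->bytes_may_use;
}

static bool toy_reserve(struct toy_space_info *si, uint64_t num_bytes)
{
	if (toy_space_used(si) + num_bytes > si->total_bytes)
		return false;			/* would need flushing / a ticket */
	si->bytes_may_use += num_bytes;		/* "->reserve" step */
	return true;
}

static void toy_alloc_extent(struct toy_space_info *si, uint64_t num_bytes)
{
	si->bytes_may_use -= num_bytes;		/* "->extent allocation" step */
	si->bytes_reserved += num_bytes;
}

static void toy_insert_reference(struct toy_space_info *si, uint64_t num_bytes)
{
	si->bytes_reserved -= num_bytes;	/* once referenced, the bytes become used */
	si->bytes_used += num_bytes;
}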
/*
* after adding space to the filesystem, we need to clear the full flags
@@ -183,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct btrfs_space_info *found;
list_for_each_entry(found, head, list)
- found->full = 0;
+ found->full = false;
}
/*
@@ -202,7 +211,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
if (btrfs_is_zoned(fs_info))
return fs_info->zone_size;
- ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags);
if (flags & BTRFS_BLOCK_GROUP_DATA)
return BTRFS_MAX_DATA_CHUNK_SIZE;
@@ -225,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
WRITE_ONCE(space_info->chunk_size, chunk_size);
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+static void init_space_info(struct btrfs_fs_info *info,
+ struct btrfs_space_info *space_info, u64 flags)
{
-
- struct btrfs_space_info *space_info;
- int i;
- int ret;
-
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
- if (!space_info)
- return -ENOMEM;
-
space_info->fs_info = info;
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
spin_lock_init(&space_info->lock);
@@ -248,11 +249,67 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+ space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY;
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+}
+
+static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags,
+ enum btrfs_space_info_sub_group id, int index)
+{
+ struct btrfs_fs_info *fs_info = parent->fs_info;
+ struct btrfs_space_info *sub_group;
+ int ret;
+
+ ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY,
+ "parent->subgroup_id=%d", parent->subgroup_id);
+ ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id);
+
+ sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS);
+ if (!sub_group)
+ return -ENOMEM;
+
+ init_space_info(fs_info, sub_group, flags);
+ parent->sub_group[index] = sub_group;
+ sub_group->parent = parent;
+ sub_group->subgroup_id = id;
+
+ ret = btrfs_sysfs_add_space_info_type(sub_group);
+ if (ret) {
+ kfree(sub_group);
+ parent->sub_group[index] = NULL;
+ }
+ return ret;
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int ret = 0;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ init_space_info(info, space_info, flags);
+
+ if (btrfs_is_zoned(info)) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ 0);
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_TREELOG,
+ 0);
+
+ if (ret)
+ return ret;
+ }
- ret = btrfs_sysfs_add_space_info_type(info, space_info);
+ ret = btrfs_sysfs_add_space_info_type(space_info);
if (ret)
return ret;
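
On a zoned filesystem the primary space info created above carries at most one sub-group: DATA_RELOC for data, TREELOG for metadata, each with ->parent pointing back at the primary. A hedged sketch of walking that layout; handle_space_info() is a placeholder, not a real helper.

/* Placeholder for whatever per-space-info work a caller needs to do. */
static void handle_space_info(struct btrfs_space_info *sinfo)
{
}

/* Sketch: visit a primary space info, then any sub-groups attached to it. */
static void example_walk_space_info(struct btrfs_space_info *primary)
{
	handle_space_info(primary);

	for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
		if (primary->sub_group[i])
			handle_space_info(primary->sub_group[i]);
	}
}

The async reclaim workers later in this patch use exactly this pattern for both the metadata and the data reclaim work items.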
@@ -303,31 +360,29 @@ out:
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group)
{
- struct btrfs_space_info *found;
+ struct btrfs_space_info *space_info = block_group->space_info;
int factor, index;
factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, block_group->flags);
- ASSERT(found);
- spin_lock(&found->lock);
- found->total_bytes += block_group->length;
- found->disk_total += block_group->length * factor;
- found->bytes_used += block_group->used;
- found->disk_used += block_group->used * factor;
- found->bytes_readonly += block_group->bytes_super;
- btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
+ spin_lock(&space_info->lock);
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ space_info->bytes_used += block_group->used;
+ space_info->disk_used += block_group->used * factor;
+ space_info->bytes_readonly += block_group->bytes_super;
+ btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
if (block_group->length > 0)
- found->full = 0;
- btrfs_try_granting_tickets(info, found);
- spin_unlock(&found->lock);
+ space_info->full = false;
+ btrfs_try_granting_tickets(space_info);
+ spin_unlock(&space_info->lock);
- block_group->space_info = found;
+ block_group->space_info = space_info;
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- down_write(&found->groups_sem);
- list_add_tail(&block_group->list, &found->block_groups[index]);
- up_write(&found->groups_sem);
+ down_write(&space_info->groups_sem);
+ list_add_tail(&block_group->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -367,10 +422,10 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
return min_t(u64, data_chunk_size, SZ_1G);
}
-static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info,
- enum btrfs_reserve_flush_enum flush)
+static u64 calc_available_free_space(const struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 profile;
u64 avail;
u64 data_chunk_size;
@@ -425,7 +480,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
/*
* On the zoned mode, we always allocate one zone as one chunk.
- * Returning non-zone size alingned bytes here will result in
+ * Returning non-zone size aligned bytes here will result in
* less pressure for the async metadata reclaim process, and it
* will over-commit too much leading to ENOSPC. Align down to the
* zone size to avoid that.
@@ -436,44 +491,77 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+static inline bool check_can_overcommit(const struct btrfs_space_info *space_info,
+ u64 space_info_used_bytes, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ const u64 avail = calc_available_free_space(space_info, flush);
+
+ return (space_info_used_bytes + bytes < space_info->total_bytes + avail);
+}
+
+static inline bool can_overcommit(const struct btrfs_space_info *space_info,
+ u64 space_info_used_bytes, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush);
+}
+
+bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
{
- u64 avail;
u64 used;
/* Don't overcommit when in mixed mode */
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
+ return false;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < space_info->total_bytes + avail)
- return 1;
- return 0;
+ return check_can_overcommit(space_info, used, bytes, flush);
}
static void remove_ticket(struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket)
+ struct reserve_ticket *ticket, int error)
{
+ lockdep_assert_held(&space_info->lock);
+
if (!list_empty(&ticket->list)) {
list_del_init(&ticket->list);
- ASSERT(space_info->reclaim_size >= ticket->bytes);
+ ASSERT(space_info->reclaim_size >= ticket->bytes,
+ "space_info->reclaim_size=%llu ticket->bytes=%llu",
+ space_info->reclaim_size, ticket->bytes);
space_info->reclaim_size -= ticket->bytes;
}
+
+ spin_lock(&ticket->lock);
+ /*
+ * If we are called from a task waiting on the ticket, it may happen
+ * that before it sets an error on the ticket, a reclaim task was able
+ * to satisfy the ticket. In that case ignore the error.
+ */
+ if (error && ticket->bytes > 0)
+ ticket->error = error;
+ else
+ ticket->bytes = 0;
+
+ wake_up(&ticket->wait);
+ spin_unlock(&ticket->lock);
}
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
*/
-void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+void btrfs_try_granting_tickets(struct btrfs_space_info *space_info)
{
struct list_head *head;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+ u64 used = btrfs_space_info_used(space_info, true);
lockdep_assert_held(&space_info->lock);
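
The helpers above reduce the overcommit decision to a single inequality: allow the reservation when used + bytes < total_bytes + avail, with avail coming from calc_available_free_space(). A worked example with made-up numbers, purely illustrative:

/*
 * Illustrative numbers only:
 *   total_bytes = 8 GiB, used (including bytes_may_use) = 7 GiB,
 *   avail (unallocated space scaled for the profile) = 2 GiB.
 * A 1.5 GiB reservation is allowed:   7 + 1.5 = 8.5  <  8 + 2 = 10
 * A 3.5 GiB reservation is rejected:  7 + 3.5 = 10.5 >= 10
 */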
@@ -481,21 +569,18 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
again:
while (!list_empty(head)) {
struct reserve_ticket *ticket;
- u64 used = btrfs_space_info_used(space_info, true);
+ u64 used_after;
ticket = list_first_entry(head, struct reserve_ticket, list);
+ used_after = used + ticket->bytes;
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
- btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
- flush)) {
- btrfs_space_info_update_bytes_may_use(fs_info,
- space_info,
- ticket->bytes);
- remove_ticket(space_info, ticket);
- ticket->bytes = 0;
+ if (used_after <= space_info->total_bytes ||
+ can_overcommit(space_info, used, ticket->bytes, flush)) {
+ btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
+ remove_ticket(space_info, ticket, 0);
space_info->tickets_id++;
- wake_up(&ticket->wait);
+ used = used_after;
} else {
break;
}
@@ -542,15 +627,16 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
-static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *info)
+static void __btrfs_dump_space_info(const struct btrfs_space_info *info)
{
+ const struct btrfs_fs_info *fs_info = info->fs_info;
const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
- flag_str,
+ btrfs_info(fs_info,
+ "space_info %s (sub-group id %d) has %lld free, is %sfull",
+ flag_str, info->subgroup_id,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
@@ -560,16 +646,16 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
info->bytes_readonly, info->bytes_zone_unusable);
}
-void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups)
+void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ bool dump_block_groups)
{
+ struct btrfs_fs_info *fs_info = info->fs_info;
struct btrfs_block_group *cache;
u64 total_avail = 0;
int index = 0;
spin_lock(&info->lock);
- __btrfs_dump_space_info(fs_info, info);
+ __btrfs_dump_space_info(info);
dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
@@ -617,11 +703,11 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void shrink_delalloc(struct btrfs_space_info *space_info,
u64 to_reclaim, bool wait_ordered,
bool for_preempt)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 ordered_bytes;
@@ -748,10 +834,10 @@ skip_async:
* and may fail for various reasons. The caller is supposed to examine the
* state of @space_info to detect the outcome.
*/
-static void flush_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 num_bytes,
- enum btrfs_flush_state state, bool for_preempt)
+static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
+ enum btrfs_flush_state state, bool for_preempt)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
@@ -780,7 +866,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case FLUSH_DELALLOC_FULL:
if (state == FLUSH_DELALLOC_FULL)
num_bytes = U64_MAX;
- shrink_delalloc(fs_info, space_info, num_bytes,
+ shrink_delalloc(space_info, num_bytes,
state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -805,7 +891,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_chunk_alloc(trans,
+ ret = btrfs_chunk_alloc(trans, space_info,
btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
@@ -834,6 +920,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
*/
ret = btrfs_commit_current_transaction(root);
break;
+ case RESET_ZONES:
+ ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
+ break;
default:
ret = -ENOSPC;
break;
@@ -844,8 +933,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
return;
}
-static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info)
+static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
@@ -853,8 +941,7 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&space_info->lock);
- avail = calc_available_free_space(fs_info, space_info,
- BTRFS_RESERVE_FLUSH_ALL);
+ avail = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
/*
@@ -869,18 +956,25 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
return to_reclaim;
}
-static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info)
+static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
- thresh = mult_perc(space_info->total_bytes, 90);
-
lockdep_assert_held(&space_info->lock);
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ thresh = mult_perc(space_info->total_bytes, 90);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -901,13 +995,6 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
return false;
/*
- * We have tickets queued, bail so we don't compete with the async
- * flushers.
- */
- if (space_info->reclaim_size)
- return false;
-
- /*
* If we have over half of the free space occupied by reservations or
* pinned then we want to start flushing.
*
@@ -936,8 +1023,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* much delalloc we need for the background flusher to kick in.
*/
- thresh = calc_available_free_space(fs_info, space_info,
- BTRFS_RESERVE_FLUSH_ALL);
+ thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
if (used < space_info->total_bytes)
@@ -981,13 +1067,15 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static bool steal_from_global_rsv(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
+ lockdep_assert_held(&space_info->lock);
+
if (!ticket->steal)
return false;
@@ -1001,21 +1089,19 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
}
global_rsv->reserved -= ticket->bytes;
- remove_ticket(space_info, ticket);
- ticket->bytes = 0;
- wake_up(&ticket->wait);
- space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
+ remove_ticket(space_info, ticket, 0);
+ space_info->tickets_id++;
+
return true;
}
/*
* We've exhausted our flushing, start failing tickets.
*
- * @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
* We call this when we've exhausted our flushing ability and haven't made
@@ -1028,72 +1114,66 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
* other tickets, or if it stumbles across a ticket that was smaller than the
* first ticket.
*/
-static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
- const bool aborted = BTRFS_FS_ERROR(fs_info);
+ const int abort_error = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
- __btrfs_dump_space_info(fs_info, space_info);
+ __btrfs_dump_space_info(space_info);
}
while (!list_empty(&space_info->tickets) &&
tickets_id == space_info->tickets_id) {
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
+ if (unlikely(abort_error)) {
+ remove_ticket(space_info, ticket, abort_error);
+ } else {
+ if (steal_from_global_rsv(space_info, ticket))
+ return true;
- if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
- return true;
-
- if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_info(fs_info, "failing ticket with %llu bytes",
- ticket->bytes);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_info(fs_info, "failing ticket with %llu bytes",
+ ticket->bytes);
- remove_ticket(space_info, ticket);
- if (aborted)
- ticket->error = -EIO;
- else
- ticket->error = -ENOSPC;
- wake_up(&ticket->wait);
+ remove_ticket(space_info, ticket, -ENOSPC);
- /*
- * We're just throwing tickets away, so more flushing may not
- * trip over btrfs_try_granting_tickets, so we need to call it
- * here to see if we can make progress with the next ticket in
- * the list.
- */
- if (!aborted)
- btrfs_try_granting_tickets(fs_info, space_info);
+ /*
+ * We're just throwing tickets away, so more flushing may
+ * not trip over btrfs_try_granting_tickets, so we need
+ * to call it here to see if we can make progress with
+ * the next ticket in the list.
+ */
+ btrfs_try_granting_tickets(space_info);
+ }
}
return (tickets_id != space_info->tickets_id);
}
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to. We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
+ enum btrfs_flush_state final_state;
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ if (btrfs_is_zoned(fs_info))
+ final_state = RESET_ZONES;
+ else
+ final_state = COMMIT_TRANS;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
if (!to_reclaim) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1102,15 +1182,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state, false);
+ flush_space(space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
@@ -1141,21 +1220,40 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
- if (flush_state > COMMIT_TRANS) {
+ if (flush_state > final_state) {
commit_cycles++;
if (commit_cycles > 2) {
- if (maybe_fail_all_tickets(fs_info, space_info)) {
+ if (maybe_fail_all_tickets(space_info)) {
flush_state = FLUSH_DELAYED_ITEMS_NR;
commit_cycles--;
} else {
- space_info->flush = 0;
+ space_info->flush = false;
}
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
}
spin_unlock(&space_info->lock);
- } while (flush_state <= COMMIT_TRANS);
+ } while (flush_state <= final_state);
+}
+
+/*
+ * This is for normal flushers; it can wait as long as needed. We will
+ * loop and continuously try to flush as long as we are making progress. We
+ * count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ do_async_reclaim_metadata_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i])
+ do_async_reclaim_metadata_space(space_info->sub_group[i]);
+ }
}
/*
@@ -1185,14 +1283,15 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv = &fs_info->trans_block_rsv;
spin_lock(&space_info->lock);
- while (need_preemptive_reclaim(fs_info, space_info)) {
+ while (need_preemptive_reclaim(space_info)) {
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
+ const u64 bytes_may_use = space_info->bytes_may_use;
+ const u64 bytes_pinned = space_info->bytes_pinned;
- loops++;
-
+ spin_unlock(&space_info->lock);
/*
* We don't have a precise counter for the metadata being
* reserved for delalloc, so we'll approximate it by subtracting
@@ -1204,8 +1303,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
btrfs_block_rsv_reserved(delayed_block_rsv) +
btrfs_block_rsv_reserved(delayed_refs_rsv) +
btrfs_block_rsv_reserved(trans_rsv);
- if (block_rsv_size < space_info->bytes_may_use)
- delalloc_size = space_info->bytes_may_use - block_rsv_size;
+ if (block_rsv_size < bytes_may_use)
+ delalloc_size = bytes_may_use - block_rsv_size;
/*
* We don't want to include the global_rsv in our calculation,
@@ -1222,10 +1321,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
if (delalloc_size > block_rsv_size) {
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
- } else if (space_info->bytes_pinned >
+ } else if (bytes_pinned >
(btrfs_block_rsv_reserved(delayed_block_rsv) +
btrfs_block_rsv_reserved(delayed_refs_rsv))) {
- to_reclaim = space_info->bytes_pinned;
+ to_reclaim = bytes_pinned;
flush = COMMIT_TRANS;
} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
btrfs_block_rsv_reserved(delayed_refs_rsv)) {
@@ -1236,7 +1335,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
- spin_unlock(&space_info->lock);
+ loops++;
/*
* We don't want to reclaim everything, just a portion, so scale
@@ -1246,7 +1345,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim >>= 2;
if (!to_reclaim)
to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
- flush_space(fs_info, space_info, to_reclaim, flush, true);
+ flush_space(space_info, to_reclaim, flush, true);
cond_resched();
spin_lock(&space_info->lock);
}
@@ -1286,6 +1385,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
+ * RESET_ZONES
+ * This state only applies to zoned mode. We scan the unused block group
+ * list, reset their zones, and reuse the block groups.
+ *
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
* before, and then the transaction commit could have freed new block groups,
@@ -1295,22 +1398,19 @@ static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
+ RESET_ZONES,
ALLOC_CHUNK_FORCE,
};
-static void btrfs_async_reclaim_data_space(struct work_struct *work)
+static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 last_tickets_id;
enum btrfs_flush_state flush_state = 0;
- fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
- space_info = fs_info->data_sinfo;
-
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1318,27 +1418,27 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
/* Something happened, fail everything and bail. */
- if (BTRFS_FS_ERROR(fs_info))
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
while (flush_state < ARRAY_SIZE(data_flush_states)) {
- flush_space(fs_info, space_info, U64_MAX,
+ flush_space(space_info, U64_MAX,
data_flush_states[flush_state], false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1352,16 +1452,16 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
if (flush_state >= ARRAY_SIZE(data_flush_states)) {
if (space_info->full) {
- if (maybe_fail_all_tickets(fs_info, space_info))
+ if (maybe_fail_all_tickets(space_info))
flush_state = 0;
else
- space_info->flush = 0;
+ space_info->flush = false;
} else {
flush_state = 0;
}
/* Something happened, fail everything and bail. */
- if (BTRFS_FS_ERROR(fs_info))
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
goto aborted_fs;
}
@@ -1370,11 +1470,24 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
return;
aborted_fs:
- maybe_fail_all_tickets(fs_info, space_info);
- space_info->flush = 0;
+ maybe_fail_all_tickets(space_info);
+ space_info->flush = false;
spin_unlock(&space_info->lock);
}
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+ do_async_reclaim_data_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++)
+ if (space_info->sub_group[i])
+ do_async_reclaim_data_space(space_info->sub_group[i]);
+}
+
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
@@ -1386,6 +1499,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
+ RESET_ZONES,
ALLOC_CHUNK,
};
@@ -1399,92 +1513,90 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
+ RESET_ZONES,
};
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket,
- const enum btrfs_flush_state *states,
- int states_nr)
+static bool is_ticket_served(struct reserve_ticket *ticket)
{
+ bool ret;
+
+ spin_lock(&ticket->lock);
+ ret = (ticket->bytes == 0);
+ spin_unlock(&ticket->lock);
+
+ return ret;
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ const enum btrfs_flush_state *states,
+ int states_nr)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
int flush_state = 0;
- spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
/*
* This is the priority reclaim path, so to_reclaim could be >0 still
* because we may have only satisfied the priority tickets and still
* left non priority tickets on the list. We would then have
* to_reclaim but ->bytes == 0.
*/
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ if (is_ticket_served(ticket))
return;
- }
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
+ spin_unlock(&space_info->lock);
while (flush_state < states_nr) {
- spin_unlock(&space_info->lock);
- flush_space(fs_info, space_info, to_reclaim, states[flush_state],
- false);
- flush_state++;
- spin_lock(&space_info->lock);
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ flush_space(space_info, to_reclaim, states[flush_state], false);
+ if (is_ticket_served(ticket))
return;
- }
+ flush_state++;
}
+ spin_lock(&space_info->lock);
/*
* Attempt to steal from the global rsv if we can, except if the fs was
* turned into error mode due to a transaction abort when flushing space
* above, in that case fail with the abort error instead of returning
* success to the caller if we can steal from the global rsv - this is
- * just to have caller fail immeditelly instead of later when trying to
+ * just to have caller fail immediately instead of later when trying to
* modify the fs, making it easier to debug -ENOSPC problems.
*/
- if (BTRFS_FS_ERROR(fs_info)) {
- ticket->error = BTRFS_FS_ERROR(fs_info);
- remove_ticket(space_info, ticket);
- } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
- ticket->error = -ENOSPC;
- remove_ticket(space_info, ticket);
- }
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info));
+ else if (!steal_from_global_rsv(space_info, ticket))
+ remove_ticket(space_info, ticket, -ENOSPC);
/*
* We must run try_granting_tickets here because we could be a large
* ticket in front of a smaller ticket that can now be satisfied with
* the available space.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
-static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void priority_reclaim_data_space(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
- spin_lock(&space_info->lock);
-
/* We could have been granted before we got here. */
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ if (is_ticket_served(ticket))
return;
- }
+ spin_lock(&space_info->lock);
while (!space_info->full) {
spin_unlock(&space_info->lock);
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
- spin_lock(&space_info->lock);
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ if (is_ticket_served(ticket))
return;
- }
+ spin_lock(&space_info->lock);
}
- ticket->error = -ENOSPC;
- remove_ticket(space_info, ticket);
- btrfs_try_granting_tickets(fs_info, space_info);
+ remove_ticket(space_info, ticket, -ENOSPC);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
@@ -1493,11 +1605,13 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
{
DEFINE_WAIT(wait);
- int ret = 0;
- spin_lock(&space_info->lock);
+ spin_lock(&ticket->lock);
while (ticket->bytes > 0 && ticket->error == 0) {
+ int ret;
+
ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ spin_unlock(&ticket->lock);
if (ret) {
/*
* Delete us from the list. After we unlock the space
@@ -1507,24 +1621,23 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
- remove_ticket(space_info, ticket);
- ticket->error = -EINTR;
- break;
+ spin_lock(&space_info->lock);
+ remove_ticket(space_info, ticket, -EINTR);
+ spin_unlock(&space_info->lock);
+ return;
}
- spin_unlock(&space_info->lock);
schedule();
finish_wait(&ticket->wait, &wait);
- spin_lock(&space_info->lock);
+ spin_lock(&ticket->lock);
}
- spin_unlock(&space_info->lock);
+ spin_unlock(&ticket->lock);
}
/*
* Do the appropriate flushing and waiting for a ticket.
*
- * @fs_info: the filesystem
* @space_info: space info for the reservation
* @ticket: ticket for the reservation
* @start_ns: timestamp when the reservation started
@@ -1534,8 +1647,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
*/
-static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static int handle_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
@@ -1549,20 +1661,20 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
- priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_reclaim_metadata_space(space_info, ticket,
priority_flush_states,
ARRAY_SIZE(priority_flush_states));
break;
case BTRFS_RESERVE_FLUSH_EVICT:
- priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_reclaim_metadata_space(space_info, ticket,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
- priority_reclaim_data_space(fs_info, space_info, ticket);
+ priority_reclaim_data_space(space_info, ticket);
break;
default:
- ASSERT(0);
+ ASSERT(0, "flush=%d", flush);
break;
}
@@ -1574,9 +1686,10 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* releasing reserved space (if an error happens the expectation is that
* space wasn't reserved at all).
*/
- ASSERT(!(ticket->bytes == 0 && ticket->error));
- trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
- start_ns, flush, ticket->error);
+ ASSERT(!(ticket->bytes == 0 && ticket->error),
+ "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error);
+ trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags,
+ orig_bytes, start_ns, flush, ticket->error);
return ret;
}
@@ -1590,9 +1703,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
-static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
@@ -1627,7 +1740,6 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
/*
* Try to reserve bytes from the block_rsv's space.
*
- * @fs_info: the filesystem
* @space_info: space info we want to allocate from
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1639,10 +1751,10 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct work_struct *async_work;
struct reserve_ticket ticket;
u64 start_ns = 0;
@@ -1650,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
int ret = -ENOSPC;
bool pending_tickets;
- ASSERT(orig_bytes);
+ ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes);
/*
* If have a transaction handle (current->journal_info != NULL), then
* the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
@@ -1659,9 +1771,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (current->journal_info) {
/* One assert per line for easier debugging. */
- ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
- ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
- ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush);
}
if (flush == BTRFS_RESERVE_FLUSH_DATA)
@@ -1689,9 +1801,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
- btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ can_overcommit(space_info, used, orig_bytes, flush))) {
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
@@ -1701,10 +1812,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* left to allocate for the block.
*/
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
- used = btrfs_space_info_used(space_info, false);
+ used -= space_info->bytes_may_use;
if (used + orig_bytes <= space_info->total_bytes) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
}
@@ -1721,6 +1831,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
+ spin_lock_init(&ticket.lock);
ticket.steal = can_steal(flush);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
@@ -1737,14 +1848,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* preemptive flushing in order to keep up with
* the workload.
*/
- maybe_clamp_preempt(fs_info, space_info);
+ maybe_clamp_preempt(space_info);
- space_info->flush = 1;
+ space_info->flush = true;
trace_btrfs_trigger_flush(fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq, async_work);
+ queue_work(system_dfl_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1758,10 +1869,10 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
!work_busy(&fs_info->preempt_reclaim_work) &&
- need_preemptive_reclaim(fs_info, space_info)) {
+ need_preemptive_reclaim(space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
+ queue_work(system_dfl_wq,
&fs_info->preempt_reclaim_work);
}
}
@@ -1769,14 +1880,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (!ret || !can_ticket(flush))
return ret;
- return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
- orig_bytes, flush);
+ return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush);
}
/*
* Try to reserve metadata bytes from the block_rsv's space.
*
- * @fs_info: the filesystem
* @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1788,20 +1897,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
+ ret = reserve_bytes(space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
+ btrfs_dump_space_info(space_info, orig_bytes, false);
}
return ret;
}
@@ -1809,30 +1919,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
/*
* Try to reserve data bytes for an allocation.
*
- * @fs_info: the filesystem
+ * @space_info: the space_info we're allocating for
* @bytes: number of bytes we need
* @flush: how we are allowed to flush
*
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
*/
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
- flush == BTRFS_RESERVE_NO_FLUSH);
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+ flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA,
+ "current->journal_info=0x%lx flush=%d",
+ (unsigned long)current->journal_info, flush);
- ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ ret = reserve_bytes(space_info, bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- data_sinfo->flags, bytes, 1);
+ space_info->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ btrfs_dump_space_info(space_info, bytes, false);
}
return ret;
}
@@ -1845,7 +1957,7 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
btrfs_info(fs_info, "dumping space info:");
list_for_each_entry(space_info, &fs_info->space_info, list) {
spin_lock(&space_info->lock);
- __btrfs_dump_space_info(fs_info, space_info);
+ __btrfs_dump_space_info(space_info);
spin_unlock(&space_info->lock);
}
dump_global_block_rsv(fs_info);
@@ -1862,7 +1974,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
int factor;
/* It's df, we don't care if it's racy */
- if (list_empty(&sinfo->ro_bgs))
+ if (data_race(list_empty(&sinfo->ro_bgs)))
return 0;
spin_lock(&sinfo->lock);
@@ -1887,13 +1999,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
static u64 calc_pct_ratio(u64 x, u64 y)
{
- int err;
+ int ret;
if (!y)
return 0;
again:
- err = check_mul_overflow(100, x, &x);
- if (err)
+ ret = check_mul_overflow(100, x, &x);
+ if (ret)
goto lose_precision;
return div64_u64(x, y);
lose_precision:
@@ -2053,7 +2165,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
}
}
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
{
bool ret;
@@ -2082,3 +2194,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
do_reclaim_sweep(space_info, raid);
}
}
+
+void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+ lockdep_assert_held(&space_info->lock);
+
+ /* Prioritize the global reservation to receive the freed space. */
+ if (global_rsv->space_info != space_info)
+ goto grant;
+
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ u64 to_add = min(len, global_rsv->size - global_rsv->reserved);
+
+ global_rsv->reserved += to_add;
+ btrfs_space_info_update_bytes_may_use(space_info, to_add);
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = true;
+ len -= to_add;
+ }
+ spin_unlock(&global_rsv->lock);
+
+grant:
+ /* Add to any tickets we may have. */
+ if (len)
+ btrfs_try_granting_tickets(space_info);
+}
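
btrfs_return_free_space() above expects space_info->lock to be held and hands freed bytes to the global block reserve before waking tickets. A hedged sketch of a caller; the context producing len is an assumption for illustration.

/* Sketch: give back `len` bytes of reservation that is no longer needed. */
static void example_return_reservation(struct btrfs_space_info *space_info, u64 len)
{
	spin_lock(&space_info->lock);
	/* Tops up the global rsv first, then lets waiting tickets take the rest. */
	btrfs_return_free_space(space_info, len);
	spin_unlock(&space_info->lock);
}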
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index efbecc0c5258..446c0614ad4a 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_EMERGENCY,
};
+/*
+ * Note that the order of the enum values is the order in which the reclaim
+ * states are tried in btrfs_async_reclaim_metadata_space().
+ */
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
@@ -91,10 +95,21 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 9,
RUN_DELAYED_IPUTS = 10,
COMMIT_TRANS = 11,
+ RESET_ZONES = 12,
+};
+
+enum btrfs_space_info_sub_group {
+ BTRFS_SUB_GROUP_PRIMARY,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ BTRFS_SUB_GROUP_TREELOG,
};
+#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1
struct btrfs_space_info {
struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *parent;
+ struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX];
+ int subgroup_id;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -127,11 +142,11 @@ struct btrfs_space_info {
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
- unsigned int full:1; /* indicates that we cannot allocate any more
+ bool full; /* indicates that we cannot allocate any more
chunks for this space */
- unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+ bool chunk_alloc; /* set if we are allocating a chunk */
- unsigned int flush:1; /* set if we are trying to make space */
+ bool flush; /* set if we are trying to make space */
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space */
@@ -209,14 +224,6 @@ struct btrfs_space_info {
s64 reclaimable_bytes;
};
-struct reserve_ticket {
- u64 bytes;
- int error;
- bool steal;
- struct list_head list;
- wait_queue_head_t wait;
-};
-
static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -229,10 +236,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i
*/
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
static inline void \
-btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
- struct btrfs_space_info *sinfo, \
+btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \
s64 bytes) \
{ \
+ struct btrfs_fs_info *fs_info = sinfo->fs_info; \
const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
lockdep_assert_held(&sinfo->lock); \
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
@@ -251,6 +258,17 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable");
+static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info,
+ bool may_use_included)
+{
+ lockdep_assert_held(&s_info->lock);
+
+ return s_info->bytes_used + s_info->bytes_reserved +
+ s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
+ (may_use_included ? s_info->bytes_may_use : 0);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group);
@@ -258,33 +276,26 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
- bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ bool dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
-void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info);
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush);
+void btrfs_try_granting_tickets(struct btrfs_space_info *space_info);
+bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
- struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes)
{
spin_lock(&space_info->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
- btrfs_try_granting_tickets(fs_info, space_info);
+ btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
@@ -292,8 +303,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
+void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
#endif /* BTRFS_SPACE_INFO_H */
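
With the space_info-first prototypes above, callers look up the space info once and pass it directly. A hedged sketch of reserving and later releasing metadata space with the new signatures; the byte count and flush mode are illustrative only.

/* Sketch: reserve metadata space, then undo the reservation. */
static int example_reserve_metadata(struct btrfs_fs_info *fs_info, u64 bytes)
{
	struct btrfs_space_info *sinfo;
	int ret;

	sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	ret = btrfs_reserve_metadata_bytes(sinfo, bytes, BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		return ret;

	/* ... consume or give up the reservation ... */

	/* Drops bytes_may_use and wakes any tickets that can now be granted. */
	btrfs_space_info_free_bytes_may_use(sinfo, bytes);
	return 0;
}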
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 8c68059ac1b0..f82e71f5d88b 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -2,12 +2,11 @@
#include <linux/slab.h>
#include "messages.h"
-#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
/*
- * Subpage (sectorsize < PAGE_SIZE) support overview:
+ * Subpage (block size < folio size) support overview:
*
* Limitations:
*
@@ -50,7 +49,7 @@
* Implementation:
*
* - Common
- * Both metadata and data will use a new structure, btrfs_subpage, to
+ * Both metadata and data will use a new structure, btrfs_folio_state, to
* record the status of each sector inside a page. This provides the extra
* granularity needed.
*
@@ -64,34 +63,14 @@
* This means a slightly higher tree locking latency.
*/
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type)
{
- if (fs_info->sectorsize >= PAGE_SIZE)
- return false;
+ struct btrfs_folio_state *bfs;
- /*
- * Only data pages (either through DIO or compression) can have no
- * mapping. And if page->mapping->host is data inode, it's subpage.
- * As we have ruled our sectorsize >= PAGE_SIZE case already.
- */
- if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
- return true;
-
- /*
- * Now the only remaining case is metadata, which we only go subpage
- * routine if nodesize < PAGE_SIZE.
- */
- if (fs_info->nodesize < PAGE_SIZE)
- return true;
- return false;
-}
-#endif
-
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type)
-{
- struct btrfs_subpage *subpage;
+ /* For metadata we don't support large folios yet. */
+ if (type == BTRFS_SUBPAGE_METADATA)
+ ASSERT(!folio_test_large(folio));
/*
* We have cases like a dummy extent buffer page, which is not mapped
@@ -101,40 +80,50 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
+ if (folio_test_private(folio))
+ return 0;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return 0;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, type);
- if (IS_ERR(subpage))
- return PTR_ERR(subpage);
+ bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type);
+ if (IS_ERR(bfs))
+ return PTR_ERR(bfs);
- folio_attach_private(folio, subpage);
+ folio_attach_private(folio, bfs);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
+ if (!folio_test_private(folio))
+ return;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_detach_private(folio);
- ASSERT(subpage);
- btrfs_free_subpage(subpage);
+ bfs = folio_detach_private(folio);
+ ASSERT(bfs);
+ btrfs_free_folio_state(bfs);
}
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type)
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type)
{
- struct btrfs_subpage *ret;
+ struct btrfs_folio_state *ret;
unsigned int real_size;
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->sectorsize < fsize);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max *
+ (fsize >> fs_info->sectorsize_bits)));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -147,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_free_subpage(struct btrfs_subpage *subpage)
-{
- kfree(subpage);
-}
-
/*
* Increase the eb_refs of current subpage.
*
@@ -163,59 +147,59 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
*/
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- atomic_inc(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ atomic_inc(&bfs->eb_refs);
}
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- ASSERT(atomic_read(&subpage->eb_refs));
- atomic_dec(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ ASSERT(atomic_read(&bfs->eb_refs));
+ atomic_dec(&bfs->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- /* For subpage support, the folio must be single page. */
- ASSERT(folio_order(folio) == 0);
-
/* Basic checks */
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(len, fs_info->sectorsize));
+ IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len);
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
if (folio->mapping)
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_next_pos(folio),
+ "start=%llu len=%u folio_pos=%llu folio_size=%zu",
+ start, len, folio_pos(folio), folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int __start_bit; \
+ unsigned int __start_bit; \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
+ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \
+ __start_bit += __bpf * btrfs_bitmap_nr_##name; \
__start_bit; \
})
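
The reworked subpage_calc_start_bit() packs one bitmap per flag, each blocks_per_folio bits wide, into bfs->bitmaps[], so a (flag, start) pair maps to offset_in_folio(start) >> sectorsize_bits plus flag_index * blocks_per_folio. A minimal userspace model of that index math (stand-alone sketch, names invented; bitmap order assumed from the enum in the subpage.h hunk below):

#include <stdio.h>
#include <stdint.h>

/* Assumed bitmap order, matching the enum in subpage.h later in this patch. */
enum { NR_UPTODATE, NR_DIRTY, NR_WRITEBACK, NR_ORDERED, NR_CHECKED, NR_LOCKED };

/* Model of subpage_calc_start_bit(): bit index inside bfs->bitmaps[]. */
static unsigned int calc_start_bit(uint64_t folio_pos, size_t folio_size,
				   unsigned int sectorsize_bits,
				   int which, uint64_t start)
{
	unsigned int blocks_per_folio = folio_size >> sectorsize_bits;
	unsigned int bit = (start - folio_pos) >> sectorsize_bits;

	return bit + which * blocks_per_folio;
}

int main(void)
{
	/* 64K folio at 128K, 4K blocks: 140K is block #3, dirty bitmap starts at bit 16. */
	printf("%u\n", calc_start_bit(128 * 1024, 64 * 1024, 12, NR_DIRTY, 140 * 1024));
	return 0;
}
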
@@ -233,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start;
}
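
btrfs_subpage_clamp_range() now clamps against folio_next_pos() rather than folio_pos() + PAGE_SIZE, so the range is trimmed to whatever size the folio actually has. A small stand-alone sketch of the clamping, including the start adjustment done earlier in the function (plain integers used in place of the folio helpers):

#include <stdio.h>
#include <stdint.h>

/* Clamp [start, start + len) to the folio [fpos, fpos + fsize); len may end up 0. */
static void clamp_range(uint64_t fpos, uint64_t fsize, uint64_t *start, uint32_t *len)
{
	uint64_t orig_start = *start;
	uint32_t orig_len = *len;
	uint64_t fend = fpos + fsize;

	*start = orig_start > fpos ? orig_start : fpos;
	if (fpos >= orig_start + orig_len)
		*len = 0;
	else
		*len = (fend < orig_start + orig_len ? fend : orig_start + orig_len) - *start;
}

int main(void)
{
	uint64_t start = 60 * 1024;
	uint32_t len = 16 * 1024;

	/* Range straddles a 64K folio at 64K: clamped to [64K, 76K). */
	clamp_range(64 * 1024, 64 * 1024, &start, &len);
	printf("start=%llu len=%u\n", (unsigned long long)start, len);
	return 0;
}
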
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
@@ -250,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, folio, start, len);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/*
* We have call sites passing @lock_page into
* extent_clear_unlock_delalloc() for compression path.
@@ -258,18 +241,20 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
- spin_unlock_irqrestore(&subpage->lock, flags);
+ if (atomic_read(&bfs->nr_locked) == 0) {
+ spin_unlock_irqrestore(&bfs->lock, flags);
return true;
}
- for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
- clear_bit(bit, subpage->bitmaps);
+ for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
+ clear_bit(bit, bfs->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
+ "atomic_read(&bfs->nr_locked)=%d cleared=%d",
+ atomic_read(&bfs->nr_locked), cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
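
btrfs_subpage_end_and_test_lock() clears the locked bits covered by [start, start + len) and only reports "last" once nr_locked drops to zero, at which point the caller may unlock the folio. A condensed userspace model of that counting (names are made up; C11 atomics stand in for the kernel's atomic_t):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 16

static unsigned char locked_bits[NBLOCKS];	/* model of the per-block locked bitmap */
static atomic_int nr_locked;			/* blocks still holding the folio lock */

/* Clear [first, first + nbits) and report whether the whole folio is now unlocked. */
static bool end_and_test_lock(int first, int nbits)
{
	int cleared = 0;

	for (int bit = first; bit < first + nbits; bit++) {
		if (locked_bits[bit]) {
			locked_bits[bit] = 0;
			cleared++;
		}
	}
	/* Mirrors atomic_sub_and_test(): true when the counter reaches zero. */
	return atomic_fetch_sub(&nr_locked, cleared) - cleared == 0;
}

int main(void)
{
	atomic_store(&nr_locked, NBLOCKS);
	for (int i = 0; i < NBLOCKS; i++)
		locked_bits[i] = 1;

	printf("%d\n", end_and_test_lock(0, 8));	/* 0: half the blocks still locked */
	printf("%d\n", end_and_test_lock(8, 8));	/* 1: last holder, folio can be unlocked */
	return 0;
}
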
@@ -292,11 +277,11 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -308,7 +293,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
* Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
@@ -322,86 +307,97 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
bool last = false;
int cleared = 0;
int bit;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
- spin_lock_irqsave(&subpage->lock, flags);
- for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
- if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ spin_lock_irqsave(&bfs->lock, flags);
+ for_each_set_bit(bit, &bitmap, blocks_per_folio) {
+ if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
+ "atomic_read(&bfs->nr_locked)=%d cleared=%d",
+ atomic_read(&bfs->nr_locked), cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
if (last)
folio_unlock(folio);
}
-#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
- bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+#define subpage_test_bitmap_all_set(fs_info, folio, name) \
+({ \
+ struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ bitmap_test_range_all_set(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
+})
-#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
- bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
+({ \
+ struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ bitmap_test_range_all_zero(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
+})
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&bfs->lock, flags);
folio_mark_dirty(folio);
}
@@ -418,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
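
btrfs_subpage_clear_and_test_dirty() clears the per-block dirty bits and tells the caller whether the folio-wide dirty bitmap is now empty, which is when the folio-level dirty flag may be dropped. A compact sketch of that propagation with the bitmap reduced to a single word (illustrative userspace code, not the kernel helpers):

#include <stdbool.h>
#include <stdio.h>

static unsigned long dirty_bitmap;	/* bit per block inside one folio */
static bool folio_dirty;		/* the folio-wide flag mirrored by the MM */

static void set_dirty(unsigned int first, unsigned int nbits)
{
	dirty_bitmap |= ((1UL << nbits) - 1) << first;
	folio_dirty = true;
}

/* Clear the range; return true when no block in the folio is dirty anymore. */
static bool clear_and_test_dirty(unsigned int first, unsigned int nbits)
{
	dirty_bitmap &= ~(((1UL << nbits) - 1) << first);
	return dirty_bitmap == 0;
}

int main(void)
{
	set_dirty(0, 4);
	set_dirty(8, 4);

	printf("%d\n", clear_and_test_dirty(0, 4));	/* 0: blocks 8..11 still dirty */
	if (clear_and_test_dirty(8, 4)) {
		folio_dirty = false;			/* last dirty range is gone */
		printf("folio clean\n");
	}
	return 0;
}
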
@@ -445,91 +441,100 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
+ bool keep_write;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+ * assume writeback is complete, and exit too early, violating sync
+ * ordering guarantees.
+ */
+ keep_write = folio_test_dirty(folio);
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ __folio_start_writeback(folio, keep_write);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_set_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -540,16 +545,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
\
- spin_lock_irqsave(&subpage->lock, flags); \
- ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ spin_lock_irqsave(&bfs->lock, flags); \
+ ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \
len >> fs_info->sectorsize_bits); \
- spin_unlock_irqrestore(&subpage->lock, flags); \
+ spin_unlock_irqrestore(&bfs->lock, flags); \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
@@ -569,7 +574,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -579,7 +584,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -589,7 +594,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
@@ -597,7 +602,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -608,7 +613,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -619,10 +624,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
btrfs_subpage_clamp_range(folio, &start, &len); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+} \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_set_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_clear_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
folio_test_uptodate);
@@ -635,6 +662,27 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
folio_test_checked);
+#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
+{ \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ \
+ ASSERT(__bpf <= BITS_PER_LONG); \
+ *dst = bitmap_read(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
+}
+
+#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
+{ \
+ unsigned long bitmap; \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
+ btrfs_warn(fs_info, \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ start, len, folio_pos(folio), __bpf, &bitmap); \
+}
+
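The reworked GET_SUBPAGE_BITMAP() extracts one per-folio bitmap out of the packed array with bitmap_read(), which is why it now asserts blocks_per_folio <= BITS_PER_LONG. A userspace approximation of that extraction using plain shifts (read_bits() below is a simplified stand-in for bitmap_read(), not the kernel implementation):

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/*
 * Read @nbits bits starting at @start from a packed bitmap stored in
 * unsigned longs; rough stand-in for the kernel's bitmap_read().
 */
static unsigned long read_bits(const unsigned long *bitmaps, unsigned int start,
			       unsigned int nbits)
{
	unsigned long word = bitmaps[start / BITS_PER_LONG] >> (start % BITS_PER_LONG);
	unsigned int used = BITS_PER_LONG - start % BITS_PER_LONG;

	if (used < nbits)
		word |= bitmaps[start / BITS_PER_LONG + 1] << used;
	if (nbits < BITS_PER_LONG)
		word &= (1UL << nbits) - 1;
	return word;
}

int main(void)
{
	/* 16 blocks per folio: the dirty bitmap occupies bits [16, 32). */
	unsigned long bitmaps[2] = { 0x000a0000UL, 0 };

	printf("dirty=%#lx\n", read_bits(bitmaps, 16, 16));	/* 0xa: blocks 1 and 3 dirty */
	return 0;
}
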
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
@@ -642,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned int start_bit;
unsigned int nbits;
unsigned long flags;
@@ -650,18 +698,22 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
ASSERT(!folio_test_dirty(folio));
return;
}
start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
nbits = len >> fs_info->sectorsize_bits;
- subpage = folio_get_private(folio);
- ASSERT(subpage);
- spin_lock_irqsave(&subpage->lock, flags);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ bfs = folio_get_private(folio);
+ ASSERT(bfs);
+ spin_lock_irqsave(&bfs->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
+ SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len);
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
+ }
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -674,86 +726,103 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
unsigned int start_bit;
unsigned int nbits;
int ret;
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
nbits = len >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/* Target range should not yet be locked. */
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->nr_locked);
- ASSERT(ret <= fs_info->sectors_per_page);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
+ SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
+ }
+ bitmap_set(bfs->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &bfs->nr_locked);
+ ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
-#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
-{ \
- const int sectors_per_page = fs_info->sectors_per_page; \
- \
- ASSERT(sectors_per_page < BITS_PER_LONG); \
- *dst = bitmap_read(subpage->bitmaps, \
- sectors_per_page * btrfs_bitmap_nr_##name, \
- sectors_per_page); \
+/*
+ * Clear the dirty flag for the folio.
+ *
+ * If the affected folio is no longer dirty, return true. Otherwise return false.
+ */
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb)
+{
+ bool last;
+
+ if (!btrfs_meta_is_subpage(eb->fs_info)) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+
+ last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len);
+ if (last) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+ return false;
}
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
- const u32 sectors_per_page = fs_info->sectors_per_page;
+ struct btrfs_folio_state *bfs;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
unsigned long checked_bitmap;
+ unsigned long locked_bitmap;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(sectors_per_page > 1);
- subpage = folio_get_private(folio);
-
- spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
-
- dump_page(folio_page(folio, 0), "btrfs subpage dump");
+ ASSERT(blocks_per_folio > 1);
+ bfs = folio_get_private(folio);
+
+ spin_lock_irqsave(&bfs->lock, flags);
+ GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
+ spin_unlock_irqrestore(&bfs->lock, flags);
+
+ dump_page(folio_page(folio, 0), "btrfs folio state dump");
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
- sectors_per_page, &uptodate_bitmap,
- sectors_per_page, &dirty_bitmap,
- sectors_per_page, &writeback_bitmap,
- sectors_per_page, &ordered_bitmap,
- sectors_per_page, &checked_bitmap);
+ blocks_per_folio, &uptodate_bitmap,
+ blocks_per_folio, &dirty_bitmap,
+ blocks_per_folio, &locked_bitmap,
+ blocks_per_folio, &writeback_bitmap,
+ blocks_per_folio, &ordered_bitmap,
+ blocks_per_folio, &checked_bitmap);
}
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(fs_info->sectors_per_page > 1);
- subpage = folio_get_private(folio);
+ ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 428fa9389fd4..d81a0ade559f 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,13 +6,13 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include "btrfs_inode.h"
struct address_space;
struct folio;
-struct btrfs_fs_info;
/*
- * Extra info for subpapge bitmap.
+ * Extra info for subpage bitmap.
*
* For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
* one larger bitmap.
@@ -31,9 +31,31 @@ struct btrfs_fs_info;
enum {
btrfs_bitmap_nr_uptodate = 0,
btrfs_bitmap_nr_dirty,
+
+ /*
+ * This can be changed to atomic eventually. But that change will rely
+ * on the async delalloc range rework for the locked bitmap, since async
+ * delalloc can unlock its range and mark blocks writeback at random
+ * times.
+ */
btrfs_bitmap_nr_writeback,
+
+ /*
+ * The ordered and checked flags are for the COW fixup; they are already
+ * deprecated and will be removed eventually.
+ */
btrfs_bitmap_nr_ordered,
btrfs_bitmap_nr_checked,
+
+ /*
+ * The locked bit is for the async delalloc range (compression). An
+ * async extent is currently queued with its range locked until the
+ * compression is done, so it can unlock the range at any time.
+ *
+ * This will need a rework of the async extent lifespan (marking
+ * writeback and doing compression) before this flag can be deprecated.
+ */
btrfs_bitmap_nr_locked,
btrfs_bitmap_nr_max
};
@@ -42,7 +64,7 @@ enum {
* Structure to trace status of each sector inside a page, attached to
* page::private for both data and metadata inodes.
*/
-struct btrfs_subpage {
+struct btrfs_folio_state {
/* Common members for both data and metadata pages */
spinlock_t lock;
union {
@@ -50,7 +72,7 @@ struct btrfs_subpage {
* Structures only used by metadata
*
* @eb_refs should only be operated under private_lock, as it
- * manages whether the subpage can be detached.
+ * manages whether the btrfs_folio_state can be detached.
*/
atomic_t eb_refs;
@@ -64,29 +86,44 @@ struct btrfs_subpage {
unsigned long bitmaps[];
};
-enum btrfs_subpage_type {
+enum btrfs_folio_type {
BTRFS_SUBPAGE_METADATA,
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
-#else
+/*
+ * Subpage support for metadata is more complex, as we can have dummy extent
+ * buffers, where folios have no mapping to determine the owning inode.
+ *
+ * Thankfully we only need to check if node size is smaller than page size.
+ * Even with larger folio support, we will only allocate a folio as large as
+ * node size.
+ * Thus if nodesize < PAGE_SIZE, we know metadata needs the subpage routines.
+ */
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->nodesize < PAGE_SIZE;
+}
static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct address_space *mapping)
+ struct folio *folio)
{
- return false;
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
+ return fs_info->sectorsize < folio_size(folio);
}
-#endif
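
With the address_space based check gone, both predicates reduce to plain size comparisons: metadata takes the subpage path when nodesize < PAGE_SIZE, data when sectorsize < folio_size(folio). A quick stand-alone illustration of that decision (sizes passed in explicitly, helper names invented):

#include <stdbool.h>
#include <stdio.h>

/* Metadata goes through the subpage paths when the node size is below the page size. */
static bool meta_is_subpage(unsigned int nodesize, unsigned long page_size)
{
	return nodesize < page_size;
}

/* Data goes through the subpage paths when the block size is below this folio's size. */
static bool data_is_subpage(unsigned int sectorsize, unsigned long folio_size)
{
	return sectorsize < folio_size;
}

int main(void)
{
	/* 16K nodes on a 64K page system: metadata needs the subpage routines. */
	printf("meta:%d\n", meta_is_subpage(16 * 1024, 64 * 1024));
	/* 4K blocks in a 4K folio: regular (non-subpage) data path. */
	printf("data:%d\n", data_is_subpage(4 * 1024, 4 * 1024));
	/* 4K blocks in a 32K large folio: subpage data path. */
	printf("data:%d\n", data_is_subpage(4 * 1024, 32 * 1024));
	return 0;
}
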
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type);
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type);
/* Allocate additional data where page represents more than one sector */
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type);
-void btrfs_free_subpage(struct btrfs_subpage *subpage);
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type);
+static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs)
+{
+ kfree(bfs);
+}
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
@@ -110,6 +147,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
* btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
+ *
+ * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios.
+ *
+ * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there
+ * is no clamp version for metadata helpers, as we either go subpage
+ * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE,
+ * and our folio is never larger than nodesize).
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -129,7 +173,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len); \
bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct folio *folio, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -137,11 +184,25 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
+/*
+ * Helper for error cleanup, where a folio will have its dirty flag cleared,
+ * with writeback started and finished.
+ */
+static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info,
+ struct folio *locked_folio,
+ u64 start, u32 len)
+{
+ btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
+}
+
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 97a85d180b61..1999533b52be 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -84,10 +84,13 @@ struct btrfs_fs_context {
u32 thread_pool_size;
unsigned long long mount_opt;
unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_level;
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -125,15 +128,13 @@ enum {
/* Rescue options */
Opt_rescue,
Opt_usebackuproot,
- Opt_nologreplay,
/* Debugging options */
Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
+ Opt_ref_tracker,
#endif
Opt_err,
};
@@ -246,8 +247,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
/* Rescue options. */
fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
- /* Deprecated, with alias rescue=nologreplay */
- __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
/* For compatibility only, alias for "rescue=nologreplay". */
@@ -257,17 +256,85 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ fsparam_flag("ref_tracker", Opt_ref_tracker),
fsparam_flag("ref_verify", Opt_ref_verify),
#endif
{}
};
-/* No support for restricting writes to btrfs devices yet... */
-static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
+static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level)
{
- return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
+ const int len = strlen(type);
+
+ return (strncmp(string, type, len) == 0) &&
+ ((may_have_level && string[len] == ':') || string[len] == '\0');
+}
+
+static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
+ const struct fs_parameter *param, int opt)
+{
+ const char *string = param->string;
+ int ret;
+
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears "force-compress"
+ * without the need to pass "compress-force=[no|none]" before
+ * specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zlib", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zstd", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "no", false) ||
+ btrfs_match_compress_type(string, "none", false)) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ ret = -EINVAL;
+ goto error;
+ }
+ return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
}
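
btrfs_match_compress_type() only accepts an exact algorithm name optionally followed by ':', so strings like "zlibfoo" that the old strncmp() check tolerated are now rejected. A small userspace reproduction of the matching plus level splitting (the level parsing is simplified to strtol() here, while the kernel goes through btrfs_compress_str2level(); the force/no handling is omitted):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool match_compress_type(const char *string, const char *type, bool may_have_level)
{
	const size_t len = strlen(type);

	return strncmp(string, type, len) == 0 &&
	       ((may_have_level && string[len] == ':') || string[len] == '\0');
}

/* Split "zstd:3" into algorithm and level; level defaults to 0 when absent. */
static int parse_compress(const char *string, char *type, size_t type_len, long *level)
{
	static const char *const names[] = { "zlib", "lzo", "zstd" };

	*level = 0;
	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		if (!match_compress_type(string, names[i], true))
			continue;
		snprintf(type, type_len, "%s", names[i]);
		if (string[strlen(names[i])] == ':')
			*level = strtol(string + strlen(names[i]) + 1, NULL, 10);
		return 0;
	}
	return -1;
}

int main(void)
{
	char type[8];
	long level;

	if (parse_compress("zstd:3", type, sizeof(type), &level) == 0)
		printf("%s level %ld\n", type, level);	/* zstd level 3 */
	printf("%d\n", parse_compress("zlibfoo", type, sizeof(type), &level));	/* -1 */
	return 0;
}
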
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -306,10 +373,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_device: {
struct btrfs_device *device;
- blk_mode_t mode = btrfs_open_mode(fc);
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(param->string, mode, false);
+ device = btrfs_scan_one_device(param->string, false);
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
return PTR_ERR(device);
@@ -339,53 +405,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
- /*
- * Provide the same semantics as older kernels that don't use fs
- * context, specifying the "compress" option clears
- * "force-compress" without the need to pass
- * "compress-force=[no|none]" before specifying "compress".
- */
- if (opt != Opt_compress_force && opt != Opt_compress_force_type)
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
-
- if (opt == Opt_compress || opt == Opt_compress_force) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zlib", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "lzo", 3) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zstd", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "no", 2) == 0) {
- ctx->compress_level = 0;
- ctx->compress_type = 0;
- btrfs_clear_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
- } else {
- btrfs_err(NULL, "unrecognized compression value %s",
- param->string);
+ if (btrfs_parse_compress(ctx, param, opt))
return -EINVAL;
- }
break;
case Opt_ssd:
if (result.negated) {
@@ -449,11 +470,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
break;
- case Opt_nologreplay:
- btrfs_warn(NULL,
- "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
- break;
case Opt_norecovery:
btrfs_info(NULL,
"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
@@ -569,6 +585,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_commit_interval:
ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) {
+ btrfs_warn(NULL, "excessive commit interval %u, use with care",
+ ctx->commit_interval);
+ }
if (ctx->commit_interval == 0)
ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
break;
@@ -624,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
break;
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
break;
+ case Opt_ref_tracker:
+ btrfs_set_opt(ctx->mount_opt, REF_TRACKER);
+ break;
#endif
default:
btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
@@ -693,12 +714,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -789,17 +807,15 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
char *name = NULL, *ptr;
u64 dirid;
int len;
int ret;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!path)
+ return ERR_PTR(-ENOMEM);
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
@@ -887,7 +903,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
fs_root = NULL;
}
- btrfs_free_path(path);
if (ptr == name + PATH_MAX - 1) {
name[0] = '/';
name[1] = '\0';
@@ -898,7 +913,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
err:
btrfs_put_root(fs_root);
- btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
}
@@ -907,7 +921,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key location;
struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
@@ -924,7 +938,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
- btrfs_free_path(path);
return PTR_ERR(di);
}
if (!di) {
@@ -933,13 +946,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
- btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
@@ -947,44 +958,46 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- int err;
+ int ret;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
- sb->s_d_op = &btrfs_dentry_operations;
+ set_default_d_op(sb, &btrfs_dentry_operations);
sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
sb->s_vop = &btrfs_verityops;
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
- sb->s_iflags |= SB_I_CGROUPWB;
+ sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
- err = super_setup_bdi(sb);
- if (err) {
+ ret = super_setup_bdi(sb);
+ if (ret) {
btrfs_err(fs_info, "super_setup_bdi failed");
- return err;
+ return ret;
}
- err = open_ctree(sb, fs_devices);
- if (err) {
- btrfs_err(fs_info, "open_ctree failed");
- return err;
+ ret = open_ctree(sb, fs_devices);
+ if (ret) {
+ btrfs_err(fs_info, "open_ctree failed: %d", ret);
+ return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- btrfs_handle_fs_error(fs_info, err, NULL);
+ ret = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto fail_close;
}
- sb->s_root = d_make_root(inode);
+ sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail_close;
}
@@ -993,7 +1006,7 @@ static int btrfs_fill_super(struct super_block *sb,
fail_close:
close_ctree(fs_info);
- return err;
+ return ret;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
@@ -1072,7 +1085,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
@@ -1135,12 +1148,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
+ if (btrfs_test_opt(info, REF_TRACKER))
+ seq_puts(seq, ",ref_tracker");
seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
@@ -1149,11 +1163,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
/*
* subvolumes are identified by ino 256
*/
-static inline int is_subvolume_inode(struct inode *inode)
+static inline bool is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return 1;
- return 0;
+ return true;
+ return false;
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
@@ -1262,7 +1276,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
/*
- * We need to cleanup all defragable inodes if the autodefragment is
+ * We need to clean up all defraggable inodes if the autodefragment is
* close or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
@@ -1433,7 +1447,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1455,10 +1469,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
@@ -1595,7 +1610,7 @@ static inline void btrfs_descending_sort_devices(
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
- struct btrfs_device_info *devices_info;
+ struct btrfs_device_info AUTO_KFREE(devices_info);
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 type;
@@ -1693,7 +1708,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
nr_devices--;
}
- kfree(devices_info);
*free_bytes = avail_space;
return 0;
}
@@ -1831,10 +1845,9 @@ static int btrfs_get_tree_super(struct fs_context *fc)
struct btrfs_fs_info *fs_info = fc->s_fs_info;
struct btrfs_fs_context *ctx = fc->fs_private;
struct btrfs_fs_devices *fs_devices = NULL;
- struct block_device *bdev;
struct btrfs_device *device;
struct super_block *sb;
- blk_mode_t mode = btrfs_open_mode(fc);
+ blk_mode_t mode = sb_open_mode(fc->sb_flags);
int ret;
btrfs_ctx_to_info(fs_info, ctx);
@@ -1844,69 +1857,103 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* With 'true' passed to btrfs_scan_one_device() (mount time) we expect
* either a valid device or an error.
*/
- device = btrfs_scan_one_device(fc->source, mode, true);
+ device = btrfs_scan_one_device(fc->source, true);
ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
return PTR_ERR(device);
}
-
fs_devices = device->fs_devices;
+ /*
+ * We cannot hold uuid_mutex while calling sget_fc(), as that would lead
+ * to a lock order reversal with s_umount.
+ *
+ * So increase the holding count of fs_devices here, which ensures the
+ * fs_devices itself won't be freed under us.
+ */
+ btrfs_fs_devices_inc_holding(fs_devices);
fs_info->fs_devices = fs_devices;
-
- ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
mutex_unlock(&uuid_mutex);
- if (ret)
- return ret;
-
- if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto error;
- }
- bdev = fs_devices->latest_dev->bdev;
- /*
- * From now on the error handling is not straightforward.
- *
- * If successful, this will transfer the fs_info into the super block,
- * and fc->s_fs_info will be NULL. However if there's an existing
- * super, we'll still have fc->s_fs_info populated. If we error
- * completely out it'll be cleaned up when we drop the fs_context,
- * otherwise it's tied to the lifetime of the super_block.
- */
sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- goto error;
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ /*
+ * Since the fs_devices is not opened, it can be freed at any
+ * time after unlocking uuid_mutex. We need to avoid double
+ * free through put_fs_context()->btrfs_free_fs_info().
+ * So reset fs_info->fs_devices to NULL here, and let the
+ * regular fs_devices reclaim path handle it.
+ *
+ * This applies to all later branches where no fs_devices is
+ * opened.
+ */
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(sb);
}
- set_device_specific_options(fs_info);
-
if (sb->s_root) {
- btrfs_close_devices(fs_devices);
- if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
- ret = -EBUSY;
+ /*
+ * This is not the first mount of the fs, so we got an existing super
+ * block and will reuse it, along with its fs_info and fs_devices.
+ *
+ * fc->s_fs_info is not touched and will be later freed by
+ * put_fs_context() through btrfs_free_fs_context().
+ */
+ ASSERT(fc->s_fs_info == fs_info);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ /*
+ * At this stage we may have RO flag mismatch between
+ * fc->sb_flags and sb->s_flags. Caller should detect such
+ * mismatch and reconfigure with sb->s_umount rwsem held if
+ * needed.
+ */
} else {
+ struct block_device *bdev;
+
+ /*
+ * This is the first mount of the fs, thus a new super block; fc->s_fs_info
+ * must be NULL, and the ownership of our fs_info and fs_devices is
+ * transferred to the super block.
+ */
+ ASSERT(fc->s_fs_info == NULL);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ ret = btrfs_open_devices(fs_devices, mode, sb);
+ if (ret < 0)
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ if (ret < 0) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ deactivate_locked_super(sb);
+ return -EACCES;
+ }
+ set_device_specific_options(fs_info);
+ bdev = fs_devices->latest_dev->bdev;
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
- btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices);
- }
-
- if (ret) {
- deactivate_locked_super(sb);
- return ret;
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
}
btrfs_clear_oneshot_options(fs_info);
fc->root = dget(sb->s_root);
return 0;
-
-error:
- btrfs_close_devices(fs_devices);
- return ret;
}
/*
@@ -1982,39 +2029,14 @@ error:
* btrfs or not, setting the whole super block RO. To make per-subvolume mounting
* work with different options work we need to keep backward compatibility.
*/
-static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+static int btrfs_reconfigure_for_mount(struct fs_context *fc)
{
- struct vfsmount *mnt;
- int ret;
- const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
-
- /*
- * We got an EBUSY because our SB_RDONLY flag didn't match the existing
- * super block, so invert our setting here and retry the mount so we
- * can get our vfsmount.
- */
- if (ro2rw)
- fc->sb_flags |= SB_RDONLY;
- else
- fc->sb_flags &= ~SB_RDONLY;
-
- mnt = fc_mount(fc);
- if (IS_ERR(mnt))
- return mnt;
+ int ret = 0;
- if (!ro2rw)
- return mnt;
+ if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY))
+ ret = btrfs_reconfigure(fc);
- /* We need to convert to rw, call reconfigure. */
- fc->sb_flags &= ~SB_RDONLY;
- down_write(&mnt->mnt_sb->s_umount);
- ret = btrfs_reconfigure(fc);
- up_write(&mnt->mnt_sb->s_umount);
- if (ret) {
- mntput(mnt);
- return ERR_PTR(ret);
- }
- return mnt;
+ return ret;
}
static int btrfs_get_tree_subvol(struct fs_context *fc)
@@ -2024,6 +2046,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
struct fs_context *dup_fc;
struct dentry *dentry;
struct vfsmount *mnt;
+ int ret = 0;
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -2040,7 +2063,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
- btrfs_free_fs_info(fs_info);
+ /*
+ * Don't call btrfs_free_fs_info() to free it, as it's only
+ * partially initialized.
+ */
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
return -ENOMEM;
}
btrfs_init_fs_info(fs_info);
@@ -2057,17 +2086,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
*/
dup_fc->s_fs_info = fs_info;
- /*
- * We'll do the security settings in our btrfs_get_tree_super() mount
- * loop, they were duplicated into dup_fc, we can drop the originals
- * here.
- */
- security_free_mnt_opts(&fc->security);
- fc->security = NULL;
+ ret = btrfs_get_tree_super(dup_fc);
+ if (ret)
+ goto error;
- mnt = fc_mount(dup_fc);
- if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
- mnt = btrfs_reconfigure_for_mount(dup_fc);
+ ret = btrfs_reconfigure_for_mount(dup_fc);
+ up_write(&dup_fc->root->d_sb->s_umount);
+ if (ret)
+ goto error;
+ mnt = vfs_create_mount(dup_fc);
put_fs_context(dup_fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
@@ -2084,25 +2111,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fc->root = dentry;
return 0;
+error:
+ put_fs_context(dup_fc);
+ return ret;
}
static int btrfs_get_tree(struct fs_context *fc)
{
- /*
- * Since we use mount_subtree to mount the default/specified subvol, we
- * have to do mounts in two steps.
- *
- * First pass through we call btrfs_get_tree_subvol(), this is just a
- * wrapper around fc_mount() to call back into here again, and this time
- * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
- * everything to open the devices and file system. Then we return back
- * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
- * from there we can do our mount_subvol() call, which will lookup
- * whichever subvol we're mounting and setup this fc with the
- * appropriate dentry for the subvol.
- */
- if (fc->s_fs_info)
- return btrfs_get_tree_super(fc);
+ ASSERT(fc->s_fs_info == NULL);
+
return btrfs_get_tree_subvol(fc);
}
@@ -2234,7 +2251,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2252,13 +2269,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- if (IS_ERR(device))
- ret = PTR_ERR(device);
- else
- ret = 0;
+ ret = PTR_ERR_OR_ZERO(device);
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2305,20 +2319,20 @@ static int check_dev_super(struct btrfs_device *dev)
return 0;
/* Only need to check the primary super block. */
- sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ sb = btrfs_read_disk_super(dev->bdev, 0, true);
if (IS_ERR(sb))
return PTR_ERR(sb);
/* Verify the checksum. */
csum_type = btrfs_super_csum_type(sb);
- if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) {
btrfs_err(fs_info, "csum type changed, has %u expect %u",
csum_type, btrfs_super_csum_type(fs_info->super_copy));
ret = -EUCLEAN;
goto out;
}
- if (btrfs_check_super_csum(fs_info, sb)) {
+ if (unlikely(btrfs_check_super_csum(fs_info, sb))) {
btrfs_err(fs_info, "csum for on-disk super block no longer matches");
ret = -EUCLEAN;
goto out;
@@ -2330,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev)
goto out;
last_trans = btrfs_get_last_trans_committed(fs_info);
- if (btrfs_super_generation(sb) != last_trans) {
+ if (unlikely(btrfs_super_generation(sb) != last_trans)) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
@@ -2411,6 +2425,66 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
return 0;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev };
+ bool can_rw;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ device = btrfs_find_device(fs_info->fs_devices, &lookup_args);
+ if (!device) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ /* Device not found, should not affect the running fs, just give a warning. */
+ btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev);
+ return 0;
+ }
+ /*
+ * The to-be-removed device is already missing?
+ *
+	 * That's odd, but no special handling is needed, so exit right now.
+ */
+ if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid);
+ return 0;
+ }
+
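+	/* Account the device as missing and stop allocating from it if it was writable. */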
+ device->fs_devices->missing_devices++;
+ if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ list_del_init(&device->dev_alloc_list);
+ WARN_ON(device->fs_devices->rw_devices < 1);
+ device->fs_devices->rw_devices--;
+ }
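+	/* Check whether the remaining devices can keep the filesystem writable in degraded mode. */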
+ can_rw = btrfs_check_rw_degradable(fs_info, device);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ /*
+	 * Now that the device is considered missing, btrfs_device_name() no
+	 * longer gives a meaningful result, so only output the devid.
+ */
+ if (unlikely(!can_rw)) {
+ btrfs_crit(fs_info,
+ "btrfs device id %llu has gone missing, can not maintain read-write",
+ device->devid);
+ return -EIO;
+ }
+ btrfs_warn(fs_info,
+ "btrfs device id %llu has gone missing, continue as degraded",
+ device->devid);
+ btrfs_set_opt(fs_info->mount_opt, DEGRADED);
+ return 0;
+}
+
+static void btrfs_shutdown(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_force_shutdown(fs_info);
+}
+#endif
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2426,6 +2500,10 @@ static const struct super_operations btrfs_super_ops = {
.unfreeze_fs = btrfs_unfreeze,
.nr_cached_objects = btrfs_nr_cached_objects,
.free_cached_objects = btrfs_free_cached_objects,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ .remove_bdev = btrfs_remove_bdev,
+ .shutdown = btrfs_shutdown,
+#endif
};
static const struct file_operations btrfs_ctl_fops = {
@@ -2458,15 +2536,15 @@ static __cold void btrfs_interface_exit(void)
static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ ", experimental=on"
+#endif
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- ", ref-verify=on"
-#endif
#ifdef CONFIG_BLK_DEV_ZONED
", zoned=yes"
#else
@@ -2478,7 +2556,17 @@ static int __init btrfs_print_mod_info(void)
", fsverity=no"
#endif
;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ if (btrfs_get_mod_read_policy() == NULL)
+ pr_info("Btrfs loaded%s\n", options);
+ else
+ pr_info("Btrfs loaded%s, read_policy=%s\n",
+ options, btrfs_get_mod_read_policy());
+#else
pr_info("Btrfs loaded%s\n", options);
+#endif
+
return 0;
}
@@ -2525,8 +2613,8 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_free_space_init,
.exit_func = btrfs_free_space_exit,
}, {
- .init_func = extent_state_init_cachep,
- .exit_func = extent_state_free_cachep,
+ .init_func = btrfs_extent_state_init_cachep,
+ .exit_func = btrfs_extent_state_free_cachep,
}, {
.init_func = extent_buffer_init_cachep,
.exit_func = extent_buffer_free_cachep,
@@ -2534,8 +2622,13 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_bioset_init,
.exit_func = btrfs_bioset_exit,
}, {
- .init_func = extent_map_init,
- .exit_func = extent_map_exit,
+ .init_func = btrfs_extent_map_init,
+ .exit_func = btrfs_extent_map_exit,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ }, {
+ .init_func = btrfs_read_policy_init,
+ .exit_func = NULL,
+#endif
}, {
.init_func = ordered_data_init,
.exit_func = ordered_data_exit,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b843308e2bc6..1f64c132b387 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -10,6 +10,7 @@
#include <linux/completion.h>
#include <linux/bug.h>
#include <linux/list.h>
+#include <linux/string_choices.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
@@ -25,6 +26,7 @@
#include "misc.h"
#include "fs.h"
#include "accessors.h"
+#include "zoned.h"
/*
* Structure name Path
@@ -160,8 +162,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa)
clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
break;
default:
- pr_warn("btrfs: sysfs: unknown feature set %d\n",
- fa->feature_set);
+ btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set);
return 0;
}
@@ -295,7 +296,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
/* Remove once support for raid stripe tree is feature complete. */
@@ -329,7 +330,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
@@ -410,12 +411,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
char *buf)
{
ssize_t ret = 0;
+ bool has_output = false;
- /* An artificial limit to only support 4K and PAGE_SIZE */
- if (PAGE_SIZE > SZ_4K)
- ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
- ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
-
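+	/* Emit each supported block size between the minimum and maximum, space separated. */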
+ for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) {
+ if (!btrfs_supported_blocksize(cur))
+ continue;
+ if (has_output)
+ ret += sysfs_emit_at(buf, ret, " ");
+ ret += sysfs_emit_at(buf, ret, "%u", cur);
+ has_output = true;
+ }
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
@@ -1118,7 +1124,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -1128,7 +1134,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -1137,13 +1143,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u64 now = ktime_get_ns();
+ u64 start_time = fs_info->commit_stats.critical_section_start_time;
+ u64 pending = 0;
+
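+	/* If a commit is currently in its critical section, report how long it has been running. */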
+ if (start_time)
+ pending = now - start_time;
return sysfs_emit(buf,
"commits %llu\n"
+ "cur_commit_ms %llu\n"
"last_commit_ms %llu\n"
"max_commit_ms %llu\n"
"total_commit_ms %llu\n",
fs_info->commit_stats.commit_count,
+ div_u64(pending, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
@@ -1175,12 +1189,62 @@ static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+static ssize_t btrfs_zoned_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_block_group *bg;
+ size_t ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return ret;
+
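+	/* Report counts of active, reclaimable and unused block groups, each under its protecting lock. */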
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "active block-groups: %zu\n",
+ list_count_nodes(&fs_info->zone_active_bgs));
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "\treclaimable: %zu\n",
+ list_count_nodes(&fs_info->reclaim_bgs));
+ ret += sysfs_emit_at(buf, ret, "\tunused: %zu\n",
+ list_count_nodes(&fs_info->unused_bgs));
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+ ret += sysfs_emit_at(buf, ret, "\tneed reclaim: %s\n",
+ str_true_false(btrfs_zoned_should_reclaim(fs_info)));
+
+ if (fs_info->data_reloc_bg)
+ ret += sysfs_emit_at(buf, ret,
+ "data relocation block-group: %llu\n",
+ fs_info->data_reloc_bg);
+ if (fs_info->treelog_bg)
+ ret += sysfs_emit_at(buf, ret,
+ "tree-log block-group: %llu\n",
+ fs_info->treelog_bg);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "active zones:\n");
+ list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
+ ret += sysfs_emit_at(buf, ret,
+ "\tstart: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n",
+ bg->start, bg->alloc_offset, bg->used,
+ bg->reserved, bg->zone_unusable);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ return ret;
+}
+BTRFS_ATTR(, zoned_stats, btrfs_zoned_stats_show);
+
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -1201,7 +1265,7 @@ static ssize_t quota_override_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
unsigned long knob;
- int err;
+ int ret;
if (!fs_info)
return -EPERM;
@@ -1209,9 +1273,9 @@ static ssize_t quota_override_store(struct kobject *kobj,
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = kstrtoul(buf, 10, &knob);
- if (err)
- return err;
+ ret = kstrtoul(buf, 10, &knob);
+ if (ret)
+ return ret;
if (knob > 1)
return -EINVAL;
@@ -1305,7 +1369,74 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
}
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
-static const char * const btrfs_read_policy_name[] = { "pid" };
+static const char *btrfs_read_policy_name[] = {
+ "pid",
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ "round-robin",
+ "devid",
+#endif
+};
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+
+/* Global module configuration parameters. */
+static char *read_policy;
+char *btrfs_get_mod_read_policy(void)
+{
+ return read_policy;
+}
+
+/* Set perms to 0 to disable the /sys/module/btrfs/parameter/read_policy interface. */
+module_param(read_policy, charp, 0);
+MODULE_PARM_DESC(read_policy,
+"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]");
+#endif
+
+int btrfs_read_policy_to_enum(const char *str, s64 *value_ret)
+{
+ char param[32];
+ char __maybe_unused *value_str;
+
+ if (!str || strlen(str) == 0)
+ return 0;
+
+ strscpy(param, str);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Separate value from input in policy:value format. */
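+	/* For example, "round-robin:64k" splits into the policy name and a 64KiB value parsed by memparse(). */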
+ value_str = strchr(param, ':');
+ if (value_str) {
+ char *retptr;
+
+ *value_str = 0;
+ value_str++;
+ if (!value_ret)
+ return -EINVAL;
+
+ *value_ret = memparse(value_str, &retptr);
+		/* Reject any trailing garbage after the value. */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || *value_ret <= 0)
+ return -EINVAL;
+ }
+#endif
+
+ return sysfs_match_string(btrfs_read_policy_name, param);
+}
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+int __init btrfs_read_policy_init(void)
+{
+ s64 value;
+
+ if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) {
+ btrfs_err(NULL, "invalid read policy or value %s", read_policy);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -1316,14 +1447,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (policy == i)
- ret += sysfs_emit_at(buf, ret, "%s[%s]",
- (ret == 0 ? "" : " "),
- btrfs_read_policy_name[i]);
- else
- ret += sysfs_emit_at(buf, ret, "%s%s",
- (ret == 0 ? "" : " "),
- btrfs_read_policy_name[i]);
+ if (ret != 0)
+ ret += sysfs_emit_at(buf, ret, " ");
+
+ if (i == policy)
+ ret += sysfs_emit_at(buf, ret, "[");
+
+ ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ if (i == BTRFS_READ_POLICY_RR)
+ ret += sysfs_emit_at(buf, ret, ":%u",
+ READ_ONCE(fs_devices->rr_min_contig_read));
+
+ if (i == BTRFS_READ_POLICY_DEVID)
+ ret += sysfs_emit_at(buf, ret, ":%llu",
+ READ_ONCE(fs_devices->read_devid));
+#endif
+ if (i == policy)
+ ret += sysfs_emit_at(buf, ret, "]");
}
ret += sysfs_emit_at(buf, ret, "\n");
@@ -1336,21 +1478,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
const char *buf, size_t len)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
- int i;
+ int index;
+ s64 value = -1;
- for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
- if (i != READ_ONCE(fs_devices->read_policy)) {
- WRITE_ONCE(fs_devices->read_policy, i);
- btrfs_info(fs_devices->fs_info,
- "read policy set to '%s'",
- btrfs_read_policy_name[i]);
+ index = btrfs_read_policy_to_enum(buf, &value);
+ if (index < 0)
+ return -EINVAL;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* If moving from RR then disable collecting fs stats. */
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR)
+ fs_devices->collect_fs_stats = false;
+
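+	/* round-robin: the optional value is the minimum contiguous read size, rounded up to the sectorsize. */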
+ if (index == BTRFS_READ_POLICY_RR) {
+ if (value != -1) {
+ const u32 sectorsize = fs_devices->fs_info->sectorsize;
+
+ if (!IS_ALIGNED(value, sectorsize)) {
+ u64 temp_value = round_up(value, sectorsize);
+
+ btrfs_debug(fs_devices->fs_info,
+"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu",
+ value, sectorsize, temp_value);
+ value = temp_value;
}
- return len;
+ } else {
+ value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
}
+
+ if (index != READ_ONCE(fs_devices->read_policy) ||
+ value != READ_ONCE(fs_devices->rr_min_contig_read)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ WRITE_ONCE(fs_devices->rr_min_contig_read, value);
+
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+ btrfs_read_policy_name[index], value);
+ }
+
+ fs_devices->collect_fs_stats = true;
+
+ return len;
}
- return -EINVAL;
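+	/* devid: the optional value must match an existing device, otherwise default to the latest device. */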
+ if (index == BTRFS_READ_POLICY_DEVID) {
+ if (value != -1) {
+ BTRFS_DEV_LOOKUP_ARGS(args);
+
+ /* Validate input devid. */
+ args.devid = value;
+ if (btrfs_find_device(fs_devices, &args) == NULL)
+ return -EINVAL;
+ } else {
+ /* Set default devid to the devid of the latest device. */
+ value = fs_devices->latest_dev->devid;
+ }
+
+ if (index != READ_ONCE(fs_devices->read_policy) ||
+ value != READ_ONCE(fs_devices->read_devid)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ WRITE_ONCE(fs_devices->read_devid, value);
+
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'",
+ btrfs_read_policy_name[index], value);
+ }
+
+ return len;
+ }
+#endif
+ if (index != READ_ONCE(fs_devices->read_policy)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
+ btrfs_read_policy_name[index]);
+ }
+
+ return len;
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
@@ -1450,6 +1651,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
+ BTRFS_ATTR_PTR(, zoned_stats),
#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum),
#endif
@@ -1792,16 +1994,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
kobject_put(&space_info->kobj);
}
-static const char *alloc_name(u64 flags)
+static const char *alloc_name(struct btrfs_space_info *space_info)
{
+ u64 flags = space_info->flags;
+
switch (flags) {
case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
return "mixed";
case BTRFS_BLOCK_GROUP_METADATA:
- return "metadata";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "metadata";
+ case BTRFS_SUB_GROUP_TREELOG:
+ return "metadata-treelog";
+ default:
+ WARN_ON_ONCE(1);
+ return "metadata (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_DATA:
- return "data";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "data";
+ case BTRFS_SUB_GROUP_DATA_RELOC:
+ return "data-reloc";
+ default:
+ WARN_ON_ONCE(1);
+ return "data (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_SYSTEM:
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
return "system";
default:
WARN_ON(1);
@@ -1813,14 +2034,13 @@ static const char *alloc_name(u64 flags)
* Create a sysfs entry for a space info type at path
* /sys/fs/btrfs/UUID/allocation/TYPE
*/
-int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info)
{
int ret;
ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
- fs_info->space_info_kobj, "%s",
- alloc_name(space_info->flags));
+ space_info->fs_info->space_info_kobj, "%s",
+ alloc_name(space_info));
if (ret) {
kobject_put(&space_info->kobj);
return ret;
@@ -2082,7 +2302,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed",
action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
@@ -2125,15 +2345,15 @@ static struct kset *btrfs_kset;
*/
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
- int error;
+ int ret;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
- "%pU", fs_devs->fsid);
- if (error) {
+ ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
+ if (ret) {
kobject_put(&fs_devs->fsid_kobj);
- return error;
+ return ret;
}
fs_devs->devices_kobj = kobject_create_and_add("devices",
@@ -2159,71 +2379,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
- int error;
+ int ret;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- error = btrfs_sysfs_add_fs_devices(fs_devs);
- if (error)
- return error;
+ ret = btrfs_sysfs_add_fs_devices(fs_devs);
+ if (ret)
+ return ret;
- error = sysfs_create_files(fsid_kobj, btrfs_attrs);
- if (error) {
+ ret = sysfs_create_files(fsid_kobj, btrfs_attrs);
+ if (ret) {
btrfs_sysfs_remove_fs_devices(fs_devs);
- return error;
+ return ret;
}
- error = sysfs_create_group(fsid_kobj,
- &btrfs_feature_attr_group);
- if (error)
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
if (!fs_info->debug_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (ret)
goto failure;
#endif
/* Discard directory */
fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
if (!fs_info->discard_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
+ if (ret)
goto failure;
- error = addrm_unknown_feature_attrs(fs_info, true);
- if (error)
+ ret = addrm_unknown_feature_attrs(fs_info, true);
+ if (ret)
goto failure;
- error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
- if (error)
+ ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (ret)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (ret)
goto failure;
return 0;
failure:
btrfs_sysfs_remove_mounted(fs_info);
- return error;
+ return ret;
}
static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index e6a284c59809..05498e5346c3 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -7,6 +7,7 @@
#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct block_device;
struct btrfs_fs_info;
struct btrfs_device;
struct btrfs_fs_devices;
@@ -36,8 +37,7 @@ void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
-int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info);
+int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
@@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup);
+int btrfs_read_policy_to_enum(const char *str, s64 *value);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+int __init btrfs_read_policy_init(void);
+char *btrfs_get_mod_read_policy(void);
+#endif
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index e607b5d52fb1..b576897d71cc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -30,6 +30,7 @@ const char *test_error[] = {
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
[TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
+ [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(fs_info, &dev->alloc_state, 0);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -110,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
static void btrfs_free_dummy_device(struct btrfs_device *dev)
{
- extent_io_tree_release(&dev->alloc_state);
+ btrfs_extent_io_tree_release(&dev->alloc_state);
kfree(dev);
}
@@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+
+ /* CRC32C csum size. */
+ fs_info->csum_size = 4;
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
+ fs_info->csum_size;
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -151,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- struct radix_tree_iter iter;
- void **slot;
struct btrfs_device *dev, *tmp;
+ struct extent_buffer *eb;
+ unsigned long index;
if (!fs_info)
return;
@@ -163,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- spin_lock(&fs_info->buffer_lock);
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
- struct extent_buffer *eb;
-
- eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
- if (!eb)
- continue;
- /* Shouldn't happen but that kind of thinking creates CVE's */
- if (radix_tree_exception(eb)) {
- if (radix_tree_deref_retry(eb))
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock(&fs_info->buffer_lock);
- free_extent_buffer_stale(eb);
- spin_lock(&fs_info->buffer_lock);
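+	/* Drop any extent buffers still tracked in the buffer_tree xarray. */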
+ xa_lock_irq(&fs_info->buffer_tree);
+ xa_for_each(&fs_info->buffer_tree, index, eb) {
+ xa_unlock_irq(&fs_info->buffer_tree);
+ free_extent_buffer(eb);
+ xa_lock_irq(&fs_info->buffer_tree);
}
- spin_unlock(&fs_info->buffer_lock);
+ xa_unlock_irq(&fs_info->buffer_tree);
btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@@ -247,6 +241,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
kfree(cache);
}
+void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->fs_info = fs_info;
+ xa_init(&trans->delayed_refs.head_refs);
+ xa_init(&trans->delayed_refs.dirty_extents);
+ spin_lock_init(&trans->delayed_refs.lock);
+}
+
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -295,6 +298,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_delayed_refs(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index b524ecf2f452..4307bdaa6749 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_TESTS_H
#define BTRFS_TESTS_H
+#include <linux/types.h>
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_run_sanity_tests(void);
@@ -25,12 +27,14 @@ enum {
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
TEST_ALLOC_IO_CONTEXT,
+ TEST_ALLOC_TRANSACTION,
};
extern const char *test_error[];
struct btrfs_root;
struct btrfs_trans_handle;
+struct btrfs_transaction;
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
@@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
+int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
@@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
new file mode 100644
index 000000000000..e2248acb906b
--- /dev/null
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sizes.h>
+#include "btrfs-tests.h"
+#include "../transaction.h"
+#include "../delayed-ref.h"
+#include "../extent-tree.h"
+
+#define FAKE_ROOT_OBJECTID 256
+#define FAKE_BYTENR 0
+#define FAKE_LEVEL 1
+#define FAKE_INO 256
+#define FAKE_FILE_OFFSET 0
+#define FAKE_PARENT SZ_1M
+
+struct ref_head_check {
+ u64 bytenr;
+ u64 num_bytes;
+ int ref_mod;
+ int total_ref_mod;
+ int must_insert;
+};
+
+struct ref_node_check {
+ u64 bytenr;
+ u64 num_bytes;
+ int ref_mod;
+ enum btrfs_delayed_ref_action action;
+ u8 type;
+ u64 parent;
+ u64 root;
+ u64 owner;
+ u64 offset;
+};
+
+static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type)
+{
+ if ((type == BTRFS_TREE_BLOCK_REF_KEY) ||
+ (type == BTRFS_SHARED_BLOCK_REF_KEY))
+ return BTRFS_REF_METADATA;
+ return BTRFS_REF_DATA;
+}
+
+static void delete_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&head->lock);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+}
+
+static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *node)
+{
+ rb_erase_cached(&node->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&node->ref_node);
+ if (!list_empty(&node->add_list))
+ list_del_init(&node->add_list);
+ btrfs_put_delayed_ref(node);
+}
+
+static int validate_ref_head(struct btrfs_delayed_ref_head *head,
+ struct ref_head_check *check)
+{
+ if (head->bytenr != check->bytenr) {
+ test_err("invalid bytenr have: %llu want: %llu", head->bytenr,
+ check->bytenr);
+ return -EINVAL;
+ }
+
+ if (head->num_bytes != check->num_bytes) {
+ test_err("invalid num_bytes have: %llu want: %llu",
+ head->num_bytes, check->num_bytes);
+ return -EINVAL;
+ }
+
+ if (head->ref_mod != check->ref_mod) {
+ test_err("invalid ref_mod have: %d want: %d", head->ref_mod,
+ check->ref_mod);
+ return -EINVAL;
+ }
+
+ if (head->total_ref_mod != check->total_ref_mod) {
+ test_err("invalid total_ref_mod have: %d want: %d",
+ head->total_ref_mod, check->total_ref_mod);
+ return -EINVAL;
+ }
+
+ if (head->must_insert_reserved != check->must_insert) {
+ test_err("invalid must_insert have: %d want: %d",
+ head->must_insert_reserved, check->must_insert);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_ref_node(struct btrfs_delayed_ref_node *node,
+ struct ref_node_check *check)
+{
+ if (node->bytenr != check->bytenr) {
+ test_err("invalid bytenr have: %llu want: %llu", node->bytenr,
+ check->bytenr);
+ return -EINVAL;
+ }
+
+ if (node->num_bytes != check->num_bytes) {
+ test_err("invalid num_bytes have: %llu want: %llu",
+ node->num_bytes, check->num_bytes);
+ return -EINVAL;
+ }
+
+ if (node->ref_mod != check->ref_mod) {
+ test_err("invalid ref_mod have: %d want: %d", node->ref_mod,
+ check->ref_mod);
+ return -EINVAL;
+ }
+
+ if (node->action != check->action) {
+ test_err("invalid action have: %d want: %d", node->action,
+ check->action);
+ return -EINVAL;
+ }
+
+ if (node->parent != check->parent) {
+ test_err("invalid parent have: %llu want: %llu", node->parent,
+ check->parent);
+ return -EINVAL;
+ }
+
+ if (node->ref_root != check->root) {
+ test_err("invalid root have: %llu want: %llu", node->ref_root,
+ check->root);
+ return -EINVAL;
+ }
+
+ if (node->type != check->type) {
+ test_err("invalid type have: %d want: %d", node->type,
+ check->type);
+ return -EINVAL;
+ }
+
+ if (btrfs_delayed_ref_owner(node) != check->owner) {
+ test_err("invalid owner have: %llu want: %llu",
+ btrfs_delayed_ref_owner(node), check->owner);
+ return -EINVAL;
+ }
+
+ if (btrfs_delayed_ref_offset(node) != check->offset) {
+ test_err("invalid offset have: %llu want: %llu",
+ btrfs_delayed_ref_offset(node), check->offset);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int simple_test(struct btrfs_trans_handle *trans,
+ struct ref_head_check *head_check,
+ struct ref_node_check *node_check)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = ref_type_from_disk_ref_type(node_check->type),
+ .action = node_check->action,
+ .parent = node_check->parent,
+ .ref_root = node_check->root,
+ .bytenr = node_check->bytenr,
+ .num_bytes = fs_info->nodesize,
+ };
+ int ret;
+
+ if (ref.type == BTRFS_REF_METADATA)
+ btrfs_init_tree_ref(&ref, node_check->owner, node_check->root,
+ false);
+ else
+ btrfs_init_data_ref(&ref, node_check->owner, node_check->offset,
+ node_check->root, true);
+
+ if (ref.type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
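+	/* The ref we just queued should now be the only selectable head. */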
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ return -EINVAL;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, head_check))
+ goto out;
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, node_check))
+ goto out;
+ ret = 0;
+out:
+ btrfs_unselect_ref_head(delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+/*
+ * These are simple tests; make sure that our btrfs_refs get turned into the
+ * appropriate btrfs_delayed_ref_node based on their settings and action.
+ */
+static int simple_tests(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .total_ref_mod = 1,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .type = BTRFS_TREE_BLOCK_REF_KEY,
+ .parent = 0,
+ .root = FAKE_ROOT_OBJECTID,
+ .owner = FAKE_LEVEL,
+ .offset = 0,
+ };
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add tree block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add extent data failed");
+ return -EINVAL;
+ }
+
+ node_check.parent = FAKE_PARENT;
+ node_check.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add shared block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_SHARED_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add shared data failed");
+ return -EINVAL;
+ }
+
+ head_check.ref_mod = -1;
+ head_check.total_ref_mod = -1;
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.type = BTRFS_TREE_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+ node_check.parent = 0;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop tree block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop extent data failed");
+ return -EINVAL;
+ }
+
+ node_check.parent = FAKE_PARENT;
+ node_check.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop shared block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_SHARED_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop shared data failed");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge tests, validate that delayed ref merging works properly, that the
+ * ref counts end up correct, and that delayed refs are deleted once they're
+ * no longer needed.
+ */
+static int merge_tests(struct btrfs_trans_handle *trans,
+ enum btrfs_ref_type type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head = NULL;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = type,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .parent = 0,
+ .ref_root = FAKE_ROOT_OBJECTID,
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ };
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 0,
+ .total_ref_mod = 0,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 2,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .parent = 0,
+ .root = FAKE_ROOT_OBJECTID,
+ };
+ int ret;
+
+ /*
+ * First add a ref and then drop it, make sure we get a head ref with a
+ * 0 total ref mod and no nodes.
+ */
+ if (type == BTRFS_REF_METADATA) {
+ node_check.type = BTRFS_TREE_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false);
+ } else {
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+ btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET,
+ FAKE_ROOT_OBJECTID, true);
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("single add and drop failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+ * Add a ref, then add another ref, make sure we get a head ref with a
+ * 2 total ref mod and 1 node.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ head_check.ref_mod = 2;
+ head_check.total_ref_mod = 2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double add failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Add two drop refs, make sure they are merged properly. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ head_check.ref_mod = -2;
+ head_check.total_ref_mod = -2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double drop failed");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Add multiple refs, then drop until we go negative again. */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 10; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 12; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = -2;
+ head_check.total_ref_mod = -2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double drop failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Drop multiple refs, then add until we go positive again. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 10; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 12; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = 2;
+ head_check.total_ref_mod = 2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("add and drop to positive failed");
+ goto out;
+ }
+
+ node_check.action = BTRFS_ADD_DELAYED_REF;
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+ * Add a bunch of refs with different roots and parents, then drop them
+ * all, make sure everything is properly merged.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 50; i++) {
+ if (!(i % 2)) {
+ ref.parent = 0;
+ ref.ref_root = FAKE_ROOT_OBJECTID + i;
+ } else {
+ ref.parent = FAKE_PARENT + (i * fs_info->nodesize);
+ }
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 50; i++) {
+ if (!(i % 2)) {
+ ref.parent = 0;
+ ref.ref_root = FAKE_ROOT_OBJECTID + i;
+ } else {
+ ref.parent = FAKE_PARENT + (i * fs_info->nodesize);
+ }
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = 0;
+ head_check.total_ref_mod = 0;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("add and drop multiple failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR_OR_NULL(head))
+ btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+/*
+ * Basic test to validate that we always get the add operations first,
+ * followed by any delete operations.
+ */
+static int select_delayed_refs_test(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head = NULL;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = BTRFS_REF_METADATA,
+ .action = BTRFS_DROP_DELAYED_REF,
+ .parent = 0,
+ .ref_root = FAKE_ROOT_OBJECTID,
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ };
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 0,
+ .total_ref_mod = 0,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .type = BTRFS_TREE_BLOCK_REF_KEY,
+ .parent = 0,
+ .owner = FAKE_LEVEL,
+ .offset = 0,
+ };
+ int ret;
+
+ /* Add the drop first. */
+ btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false);
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
+ /*
+	 * Now queue the add ref, and give it a different root so it sorts
+	 * logically later in the rb tree.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 1;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ head = NULL;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("head check failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.root = FAKE_ROOT_OBJECTID + 1;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+	 * Now do the same thing, but with an add that gets deleted because of a
+	 * merge, and make sure another add is still in place.
+ */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 1;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 2;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ head = NULL;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("head check failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_ADD_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID + 2;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+ ret = 0;
+out:
+ if (head)
+ btrfs_unselect_ref_head(delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_transaction *transaction;
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
+ test_msg("running delayed refs tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+ transaction = kmalloc(sizeof(*transaction), GFP_KERNEL);
+ if (!transaction) {
+ test_std_err(TEST_ALLOC_TRANSACTION);
+ ret = -ENOMEM;
+ goto out_free_fs_info;
+ }
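+	/* Wire a dummy handle to a dummy transaction so the delayed ref root is usable. */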
+ btrfs_init_dummy_trans(&trans, fs_info);
+ btrfs_init_dummy_transaction(transaction, fs_info);
+ trans.transaction = transaction;
+
+ ret = simple_tests(&trans);
+ if (!ret) {
+ test_msg("running delayed refs merge tests on metadata refs");
+ ret = merge_tests(&trans, BTRFS_REF_METADATA);
+ }
+
+ if (!ret) {
+ test_msg("running delayed refs merge tests on data refs");
+ ret = merge_tests(&trans, BTRFS_REF_DATA);
+ }
+
+ if (!ret)
+ ret = select_delayed_refs_test(&trans);
+
+ kfree(transaction);
+out_free_fs_info:
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 0a2dbfaaf49e..a0187d6163df 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -14,17 +14,17 @@
#include "../disk-io.h"
#include "../btrfs_inode.h"
-#define PROCESS_UNLOCK (1 << 0)
-#define PROCESS_RELEASE (1 << 1)
-#define PROCESS_TEST_LOCKED (1 << 2)
+#define PROCESS_UNLOCK (1U << 0)
+#define PROCESS_RELEASE (1U << 1)
+#define PROCESS_TEST_LOCKED (1U << 2)
static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
struct folio_batch fbatch;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
int i;
int count = 0;
int loops = 0;
@@ -74,9 +74,9 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
dest[0] = 0;
PRINT_ONE_FLAG(state, dest, cur, DIRTY);
- PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
PRINT_ONE_FLAG(state, dest, cur, LOCKED);
- PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2);
PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
@@ -114,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
- unsigned long index = 0;
/* In this test we need at least 2 file extents at its maximum size */
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 total_dirty = 2 * max_bytes;
@@ -150,14 +149,14 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
- extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_err("failed to allocate test page");
@@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("couldn't find the locked page");
goto out_bits;
}
- set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
out_bits:
if (ret)
dump_extent_io_tree(tmp);
- clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
+ btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL);
out:
if (locked_page)
put_page(locked_page);
@@ -344,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
unsigned long i;
for (i = 0; i < eb->len * BITS_PER_BYTE; i++) {
- int bit, bit1;
+ bool bit_set, bit1_set;
- bit = !!test_bit(i, bitmap);
- bit1 = !!extent_buffer_test_bit(eb, 0, i);
- if (bit1 != bit) {
+ bit_set = test_bit(i, bitmap);
+ bit1_set = extent_buffer_test_bit(eb, 0, i);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -361,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
return -EINVAL;
}
- bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
- i % BITS_PER_BYTE);
- if (bit1 != bit) {
+ bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -506,7 +505,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb)
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- unsigned long *bitmap = NULL;
+ unsigned long AUTO_KFREE(bitmap);
struct extent_buffer *eb = NULL;
int ret;
@@ -525,7 +524,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, 0);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -542,7 +541,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
* Test again for case where the tree block is sectorsize aligned but
* not nodesize aligned.
*/
- eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, sectorsize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -552,7 +551,6 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
ret = __test_eb_bitmaps(bitmap, eb);
out:
free_extent_buffer(eb);
- kfree(bitmap);
btrfs_free_dummy_fs_info(fs_info);
return ret;
}
@@ -565,10 +563,10 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
- extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
- find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
if (start != 0 || end != -1) {
test_err(
"error getting a range from completely empty tree: start %llu end %llu",
@@ -579,11 +577,11 @@ static int test_find_first_clear_extent_bit(void)
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
*/
- set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != 0 || end != SZ_1M - 1) {
test_err("error finding beginning range: start %llu end %llu",
@@ -592,14 +590,14 @@ static int test_find_first_clear_extent_bit(void)
}
/* Now add 32M-64M so that we have a hole between 4M-32M */
- set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
/*
* Request first hole starting at 12M, we should get 4M-32M
*/
- find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding trimmed range: start %llu end %llu",
@@ -611,8 +609,8 @@ static int test_find_first_clear_extent_bit(void)
* Search in the middle of allocated range, should get the next one
* available, which happens to be unallocated -> 4M-32M
*/
- find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding next unalloc range: start %llu end %llu",
@@ -624,9 +622,9 @@ static int test_find_first_clear_extent_bit(void)
* Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
* being unset in this range, we should get the entry in range 64M-72M
*/
- set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
test_err("error finding exact range: start %llu end %llu",
@@ -634,8 +632,8 @@ static int test_find_first_clear_extent_bit(void)
goto out;
}
- find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
/*
* Search in the middle of set range whose immediate neighbour doesn't
@@ -651,7 +649,7 @@ static int test_find_first_clear_extent_bit(void)
* Search beyond any known range, shall return after last known range
* and end should be -1
*/
- find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
if (start != SZ_64M + SZ_8M || end != -1) {
test_err(
"error handling beyond end of range search: start %llu end %llu",
@@ -663,7 +661,7 @@ static int test_find_first_clear_extent_bit(void)
out:
if (ret)
dump_extent_io_tree(&tree);
- clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
return ret;
}
@@ -730,7 +728,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, SZ_1M);
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
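Two of the hunks above (test_eb_bitmaps() here, and test_rmap_block() in the next file) switch a manually kfree()d buffer to an AUTO_KFREE() declaration and drop the explicit kfree() calls from the exit paths. The macro itself is not part of this diff; a plausible definition on top of the kernel's scope-based cleanup helpers (linux/cleanup.h, with DEFINE_FREE(kfree, ...) from linux/slab.h) is shown below as a hedged guess, not the actual btrfs-tests definition.

	/* Hypothetical sketch; the real AUTO_KFREE() may differ. */
	#define AUTO_KFREE(name)	*name __free(kfree) = NULL

	/*
	 * With that, "unsigned long AUTO_KFREE(bitmap);" expands to
	 * "unsigned long *bitmap __free(kfree) = NULL;", so the buffer is
	 * kfree()d automatically when it goes out of scope, which is why
	 * the kfree(bitmap) in the out: label above could be removed.
	 */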
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 56e61ac1cc64..0b9f25dd1a68 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
while (!RB_EMPTY_ROOT(&em_tree->root)) {
node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
@@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
refcount_set(&em->refs, 1);
}
#endif
- free_extent_map(em);
+ btrfs_free_extent_map(em);
}
write_unlock(&em_tree->lock);
@@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 16K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [16K, 20K) following [0, 16K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [16K, 20K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K ||
em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
@@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 1K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [4K, 8K) following [0, 1K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K ||
em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (start < em->start || start + len > extent_map_end(em) ||
- em->start != extent_map_block_start(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em) ||
+ em->start != btrfs_extent_map_block_start(em)) {
test_err(
"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [0, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [8K, 32K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = -ENOENT;
goto out;
}
- if (start < em->start || start + len > extent_map_end(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em)) {
test_err(
"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
struct extent_map *em;
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("cannot add extent map [%llu, %llu)", start, start + len);
return ret;
@@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
if (ret)
goto out;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret2 = free_extent_map_tree(inode);
if (ret == 0)
ret = ret2;
@@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_msg("Running btrfs_drop_extent_cache with pinned");
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
* Drop [0, 36K). This should skip the [0, 4K) extent and then split the
@@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Make sure our extent maps look sane. */
ret = -EINVAL;
- em = lookup_extent_mapping(em_tree, 0, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K);
if (!em) {
test_err("didn't find an em at 0 as expected");
goto out;
@@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an em when we weren't expecting one");
@@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
read_unlock(&em_tree->lock);
if (!em) {
test_err("didn't find an em at 32K as expected");
@@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) {
test_err("em->block_start is %llu, expected 36K",
- extent_map_block_start(em));
+ btrfs_extent_map_block_start(em));
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
+ em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an unexpected em above 48K");
@@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Unpin our extent to prevent warning when removing it below. */
- ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0);
if (ret == 0)
ret = ret2;
ret2 = free_extent_map_tree(inode);
@@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [120K, 128K)");
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [108K, 144K)");
goto out;
@@ -1013,7 +1013,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
struct btrfs_chunk_map *map;
- u64 *logical = NULL;
+ u64 AUTO_KFREE(logical);
int i, out_ndaddrs, out_stripe_len;
int ret;
@@ -1045,7 +1045,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
test_err("error adding chunk map to mapping tree");
- goto out_free;
+ btrfs_free_chunk_map(map);
+ return ret;
}
ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
@@ -1078,8 +1079,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
btrfs_remove_chunk_map(fs_info, map);
-out_free:
- kfree(logical);
return ret;
}
@@ -1094,7 +1093,7 @@ int btrfs_test_extent_map(void)
/*
* Test a chunk with 2 data stripes one of which
* intersects the physical address of the super block
- * is correctly recognised.
+ * is correctly recognized.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
.physical_start = SZ_64M - SZ_4M,
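The test_case_7() hunks above repeatedly use the same lookup pattern with the renamed extent map API: take the tree's read lock, look the mapping up (which returns it with an extra reference), drop the lock, inspect the result, and release the reference. A minimal sketch of that pattern, assuming the lookup/free semantics implied by the test code:

	struct extent_map *em;

	read_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);
	if (em) {
		/* Inspect em->start, em->len, btrfs_extent_map_block_start(em)... */
		btrfs_free_extent_map(em);	/* drop the reference taken by the lookup */
	}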
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index b61972046feb..c8822edd32e2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
unsigned int i;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
ret = PTR_ERR(info);
@@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
goto invalid;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(cache, path, offset);
+ bit = btrfs_free_space_test_bit(cache, path, offset);
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
@@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
btrfs_release_path(path);
@@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, cache, path);
+ ret = btrfs_convert_free_space_to_extents(trans, cache, path);
if (ret) {
test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
test_err("could not convert to bitmaps");
return ret;
@@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start,
- cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
cache->start + cache->length - alignment,
alignment);
if (ret) {
@@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 4 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
goto out;
}
- ret = add_block_group_free_space(&trans, cache);
+ ret = btrfs_add_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
test_err("could not convert block group to bitmaps");
goto out;
@@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, cache);
+ ret = btrfs_remove_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not remove block group free space");
goto out;
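For reference, __check_free_space_extents() in the hunk near the top of this file reconstructs free space extents from a bitmap by watching bit transitions: a 0->1 transition opens an extent and a 1->0 transition closes it. A standalone sketch of that walk follows; the names and the reporting are illustrative, not the helper's actual code.

	static void walk_free_space_bitmap(const unsigned long *bitmap, u64 first,
					   unsigned long nbits, u64 sectorsize)
	{
		int prev_bit = 0;
		u64 extent_start = 0;

		for (unsigned long i = 0; i < nbits; i++) {
			int bit = test_bit(i, bitmap) ? 1 : 0;

			if (prev_bit == 0 && bit == 1)
				extent_start = first + i * sectorsize;
			else if (prev_bit == 1 && bit == 0)
				pr_info("free extent [%llu, %llu)\n", extent_start,
					first + i * sectorsize);
			prev_bit = bit;
		}
		/* Close an extent that runs to the end of the bitmap. */
		if (prev_bit == 1)
			pr_info("free extent [%llu, %llu)\n", extent_start,
				first + (u64)nbits * sectorsize);
	}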
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ea3bc2225fe..a4c2b7748b95 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
@@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* this?
*/
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start - orig_start, em->offset);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* A hole between regular extents but no hole extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
@@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
@@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
em->flags);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != sectorsize) {
- test_err("expected a real extent, got %llu", extent_map_block_start(em));
+ if (btrfs_extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu",
+ btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
@@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* Empty */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 3fc8dc3fd980..05cfda8af422 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -20,7 +20,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
struct btrfs_extent_item *item;
struct btrfs_extent_inline_ref *iref;
struct btrfs_tree_block_info *block_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
@@ -41,7 +41,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
test_err("couldn't insert ref %d", ret);
- btrfs_free_path(path);
return ret;
}
@@ -61,7 +60,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_free_path(path);
return 0;
}
@@ -70,7 +68,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
{
struct btrfs_trans_handle trans;
struct btrfs_extent_item *item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 refs;
int ret;
@@ -90,7 +88,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
- btrfs_free_path(path);
return ret;
}
@@ -112,7 +109,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
if (ret)
test_err("failed to insert backref");
- btrfs_free_path(path);
return ret;
}
@@ -121,7 +117,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
{
struct btrfs_trans_handle trans;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
btrfs_init_dummy_trans(&trans, NULL);
@@ -139,11 +135,9 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
test_err("didn't find our key %d", ret);
- btrfs_free_path(path);
return ret;
}
btrfs_del_item(&trans, root, path);
- btrfs_free_path(path);
return 0;
}
@@ -152,7 +146,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
{
struct btrfs_trans_handle trans;
struct btrfs_extent_item *item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 refs;
int ret;
@@ -172,7 +166,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
- btrfs_free_path(path);
return ret;
}
@@ -198,7 +191,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return ret;
}
btrfs_del_item(&trans, root, path);
- btrfs_free_path(path);
return ret;
}
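The qgroup test hunks above replace a manually managed struct btrfs_path with BTRFS_PATH_AUTO_FREE(), which lets every early return drop its btrfs_free_path() call. The macro is defined in ctree.h and is not part of this diff; a plausible shape, again based on the kernel's cleanup.h helpers and hedged as an approximation rather than the exact definition:

	/* Sketch; see ctree.h for the real definition. */
	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))

	#define BTRFS_PATH_AUTO_FREE(path_name) \
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL

The path is still allocated with btrfs_alloc_path() as before; only the freeing becomes automatic at scope exit.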
diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
index 30f17eb7b6a8..a7bc58a5c1e2 100644
--- a/fs/btrfs/tests/raid-stripe-tree-tests.c
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -14,6 +14,8 @@
#define RST_TEST_NUM_DEVICES (2)
#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
+#define SZ_48K (SZ_32K + SZ_16K)
+
typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
@@ -30,6 +32,613 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de
}
/*
+ * Test creating a range of three extents and then punching a hole in the
+ * middle, deleting all of the middle extent and partially deleting the "book ends".
+ */
+static int test_punch_hole_3extents(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 len1 = SZ_1M;
+ u64 logical2 = logical1 + len1;
+ u64 len2 = SZ_1M;
+ u64 logical3 = logical2 + len2;
+ u64 len3 = SZ_1M;
+ u64 hole_start = logical1 + SZ_256K;
+ u64 hole_len = SZ_2M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+
+ /* Prepare for the test, 1st create 3 x 1M extents. */
+ bioc->map_type = map_type;
+ bioc->size = len1;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ bioc->size = len2;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical3;
+ bioc->size = len3;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical3 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ /*
+ * Delete a range starting at logical1 + 256K and 2M in length. Extent
+ * 1 is truncated to 256k length, extent 2 is completely dropped and
+ * extent 3 is moved 256K to the right.
+ */
+ ret = btrfs_delete_raid_extent(trans, hole_start, hole_len);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ hole_start, hole_start + hole_len);
+ goto out;
+ }
+
+ /* Get the first extent and check its size. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len1 != SZ_256K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_256K, len1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Get the second extent and check it's absent. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded should fail",
+ logical2, logical2 + len2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Get the third extent and check its size. */
+ logical3 += SZ_256K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical3, logical3 + len3);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical3) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical3, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len3 != SZ_1M - SZ_256K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_1M - SZ_256K, len3);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1, len1);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical3, len3);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical3, logical3 + len3);
+ goto out;
+ }
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+static int test_delete_two_extents(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 len1 = SZ_1M;
+ u64 logical2 = logical1 + len1;
+ u64 len2 = SZ_1M;
+ u64 logical3 = logical2 + len2;
+ u64 len3 = SZ_1M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+
+ /* Prepare for the test, 1st create 3 x 1M extents. */
+ bioc->map_type = map_type;
+ bioc->size = len1;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ bioc->size = len2;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical3;
+ bioc->size = len3;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical3 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ /*
+ * Delete a range starting at logical1 and 2M in length. Extents 1
+ * and 2 are dropped and extent 3 is kept as is.
+ */
+ ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1 + len2);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical2, logical2 + len2);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical3, logical3 + len3);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical3) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical3, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len3 != SZ_1M) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_1M, len3);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical3, len3);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/* Test punching a hole into a single RAID stripe-extent. */
+static int test_punch_hole(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 hole_start = logical1 + SZ_32K;
+ u64 hole_len = SZ_64K;
+ u64 logical2 = hole_start + hole_len;
+ u64 len = SZ_1M;
+ u64 len1 = SZ_32K;
+ u64 len2 = len - len1 - hole_len;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0,
+ &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical1,
+ logical1 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_1M) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_1M, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, hole_start, hole_len);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ hole_start, hole_start + hole_len);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len1 != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical2,
+ logical2 + len2);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical2) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical2, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len2 != len - len1 - hole_len) {
+ test_err("invalid length, expected %llu, got %llu",
+ len - len1 - hole_len, len2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Check for the absence of the hole. */
+ ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ ret = -EINVAL;
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ hole_start, hole_start + SZ_64K);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1, len1);
+ if (ret)
+ goto out;
+
+ ret = btrfs_delete_raid_extent(trans, logical2, len2);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 1M RST write that spans two adjacent RST items on disk and then
+ * delete a portion starting in the first item and spanning into the second
+ * item. This is similar to test_front_delete(), but spanning multiple items.
+ */
+static int test_front_delete_prev_item(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 logical2 = SZ_2M;
+ u64 len = SZ_1M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ /* Insert RAID extent 1. */
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ /* Insert RAID extent 2, directly adjacent to it. */
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1 + SZ_512K, (u64)SZ_1M);
+ goto out;
+ }
+
+ /* Verify item 1 is truncated to 512K. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0,
+ &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical1,
+ logical1 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_512K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_512K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Verify item 2's start is moved by 512K. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len,
+ map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical2 + SZ_512K, logical2 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical2 + SZ_512K) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical2 + SZ_512K, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_512K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_512K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Verify there's a hole at [1M+512K, 2M+512K]. */
+ len = SZ_1M;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID [%llu, %llu] succeeded, should fail",
+ logical1 + SZ_512K, logical1 + SZ_512K + len);
+ goto out;
+ }
+
+ /* Clean up after us. */
+ ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K);
+ if (ret)
+ goto out;
+
+ ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
* Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
* delete the 1st 32K, making the new start address 1M+32K.
*/
@@ -94,45 +703,45 @@ static int test_front_delete(struct btrfs_trans_handle *trans)
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical, SZ_16K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed", logical,
- logical + SZ_32K);
+ logical + SZ_16K);
goto out;
}
- len = SZ_32K;
- ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+ len -= SZ_16K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len,
map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed",
- logical + SZ_32K, logical + SZ_32K + len);
+ logical + SZ_16K, logical + SZ_64K);
goto out;
}
- if (io_stripe.physical != logical + SZ_32K) {
+ if (io_stripe.physical != logical + SZ_16K) {
test_err("invalid physical address, expected %llu, got %llu",
- logical + SZ_32K, io_stripe.physical);
+ logical + SZ_16K, io_stripe.physical);
ret = -EINVAL;
goto out;
}
- if (len != SZ_32K) {
+ if (len != SZ_48K) {
test_err("invalid stripe length, expected %llu, got %llu",
- (u64)SZ_32K, len);
+ (u64)SZ_48K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
- if (!ret) {
+ if (ret != -ENODATA) {
ret = -EINVAL;
test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
- logical, logical + SZ_32K);
+ logical, logical + SZ_16K);
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K);
out:
btrfs_put_bioc(bioc);
return ret;
@@ -209,14 +818,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed",
- logical + SZ_32K, logical + SZ_64K);
+ logical + SZ_48K, logical + SZ_64K);
goto out;
}
- len = SZ_32K;
+ len = SZ_48K;
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
@@ -231,9 +840,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)
goto out;
}
- if (len != SZ_32K) {
+ if (len != SZ_48K) {
test_err("invalid stripe length, expected %llu, got %llu",
- (u64)SZ_32K, len);
+ (u64)SZ_48K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ len = SZ_16K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded should fail",
+ logical + SZ_48K, logical + SZ_64K);
ret = -EINVAL;
goto out;
}
@@ -456,6 +1075,10 @@ static const test_func_t tests[] = {
test_create_update_delete,
test_tail_delete,
test_front_delete,
+ test_front_delete_prev_item,
+ test_punch_hole,
+ test_punch_hole_3extents,
+ test_delete_two_extents,
};
static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
@@ -478,8 +1101,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
ret = PTR_ERR(root);
goto out;
}
- btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
- BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
+ btrfs_set_super_incompat_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc0b837efd5d..05ee4391c83a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,6 +32,8 @@
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"
+#include "ordered-data.h"
+#include "delayed-inode.h"
static struct kmem_cache *btrfs_trans_handle_cachep;
@@ -103,7 +105,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | attached to transid N+1. |
* | |
* | To next stage: |
- * | Until all tree blocks are super blocks are |
+ * | Until all tree blocks and super blocks are |
* | written to block devices |
* V |
* Transaction N [[TRANS_STATE_COMPLETED]] V
@@ -138,7 +140,6 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
@@ -160,7 +161,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the deleted_bgs list during a transaction abort.
+ */
+ spin_lock(&transaction->fs_info->unused_bgs_lock);
list_del_init(&cache->bg_list);
+ spin_unlock(&transaction->fs_info->unused_bgs_lock);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
@@ -179,7 +186,8 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING,
+ "cur_trans->state=%d", cur_trans->state);
down_write(&fs_info->commit_root_sem);
@@ -191,7 +199,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- extent_io_tree_release(&root->dirty_log_pages);
+ btrfs_extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -274,8 +282,10 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
+ const int abort_error = cur_trans->aborted;
+
spin_unlock(&fs_info->trans_lock);
- return cur_trans->aborted;
+ return abort_error;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
@@ -375,10 +385,10 @@ loop:
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
- extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES);
- extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
- IO_TREE_FS_PINNED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS);
btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -396,7 +406,7 @@ loop:
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int force)
+ bool force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
@@ -530,15 +540,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
}
}
-static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
+static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- return 0;
+ return false;
if (type == TRANS_START)
- return 1;
+ return true;
- return 0;
+ return false;
}
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
@@ -567,7 +577,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(si, bytes, flush);
/*
* If we are an emergency flush, which can steal from the global block
@@ -577,7 +587,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(si, bytes, flush);
}
return ret;
@@ -753,9 +763,10 @@ got_it:
* value here.
*/
if (do_chunk_alloc && num_bytes) {
- u64 flags = h->block_rsv->space_info->flags;
+ struct btrfs_space_info *space_info = h->block_rsv->space_info;
+ u64 flags = space_info->flags;
- btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags),
CHUNK_ALLOC_NO_FORCE);
}
@@ -795,8 +806,7 @@ alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
if (delayed_refs_bytes)
- btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
- delayed_refs_bytes);
+ btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -1016,13 +1026,18 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
if (!trans->block_rsv) {
- ASSERT(!trans->bytes_reserved);
- ASSERT(!trans->delayed_refs_bytes_reserved);
+ ASSERT(trans->bytes_reserved == 0,
+ "trans->bytes_reserved=%llu", trans->bytes_reserved);
+ ASSERT(trans->delayed_refs_bytes_reserved == 0,
+ "trans->delayed_refs_bytes_reserved=%llu",
+ trans->delayed_refs_bytes_reserved);
return;
}
if (!trans->bytes_reserved) {
- ASSERT(!trans->delayed_refs_bytes_reserved);
+ ASSERT(trans->delayed_refs_bytes_reserved == 0,
+ "trans->delayed_refs_bytes_reserved=%llu",
+ trans->delayed_refs_bytes_reserved);
return;
}
@@ -1121,13 +1136,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
bool wait_writeback = false;
- ret = convert_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT,
- mark, &cached_state);
+ ret = btrfs_convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
@@ -1148,8 +1163,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
if (!ret)
ret = filemap_fdatawrite_range(mapping, start, end);
if (!ret && wait_writeback)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1168,14 +1183,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
int ret = 0;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_NEED_WAIT, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
/*
* Ignore -ENOMEM errors returned by clear_extent_bit().
* When committing the transaction, we'll remove any entries
@@ -1184,13 +1198,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- ret = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, &cached_state);
+ ret = btrfs_clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT, &cached_state);
if (ret == -ENOMEM)
ret = 0;
if (!ret)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1204,15 +1218,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
- int err;
+ int ret;
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
@@ -1220,22 +1234,23 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
struct btrfs_fs_info *fs_info = log_root->fs_info;
struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
bool errors = false;
- int err;
+ int ret;
- ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID,
+ "root_id(log_root)=%llu", btrfs_root_id(log_root));
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
- if ((mark & EXTENT_DIRTY) &&
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY_LOG1) &&
test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
errors = true;
- if ((mark & EXTENT_NEW) &&
+ if ((mark & EXTENT_DIRTY_LOG2) &&
test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
/*
@@ -1258,7 +1273,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
- extent_io_tree_release(&trans->transaction->dirty_pages);
+ btrfs_extent_io_tree_release(&trans->transaction->dirty_pages);
if (ret)
return ret;
@@ -1320,7 +1335,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
- struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1328,7 +1342,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING,
+ "trans->transaction->state=%d", trans->transaction->state);
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
@@ -1356,13 +1371,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
- next = fs_info->dirty_cowonly_roots.next;
- list_del_init(next);
- root = list_entry(next, struct btrfs_root, dirty_list);
+
+ root = list_first_entry(&fs_info->dirty_cowonly_roots,
+ struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+ list_move_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
@@ -1462,7 +1477,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING,
+ "trans->transaction->state=%d", trans->transaction->state);
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1480,9 +1496,15 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
* At this point we can neither have tasks logging inodes
* from a root nor trying to commit a log tree.
*/
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
+ ASSERT(atomic_read(&root->log_writers) == 0,
+ "atomic_read(&root->log_writers)=%d",
+ atomic_read(&root->log_writers));
+ ASSERT(atomic_read(&root->log_commit[0]) == 0,
+ "atomic_read(&root->log_commit[0])=%d",
+ atomic_read(&root->log_commit[0]));
+ ASSERT(atomic_read(&root->log_commit[1]) == 0,
+ "atomic_read(&root->log_commit[1])=%d",
+ atomic_read(&root->log_commit[1]));
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)btrfs_root_id(root),
@@ -1563,7 +1585,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* qgroup counters could end up wrong.
*/
ret = btrfs_run_delayed_refs(trans, U64_MAX);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1634,8 +1656,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = &pending->dir->vfs_inode;
- struct btrfs_path *path;
+ struct btrfs_inode *parent_inode = pending->dir;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1660,7 +1682,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* filesystem.
*/
nofs_flags = memalloc_nofs_save();
- pending->error = fscrypt_setup_filename(parent_inode,
+ pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
@@ -1688,34 +1710,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto clear_skip_qgroup;
}
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_ROOT_ITEM_KEY;
-
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- parent_root = BTRFS_I(parent_inode)->root;
+ parent_root = parent_inode->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
- cur_time = current_time(parent_inode);
+ cur_time = current_time(&parent_inode->vfs_inode);
/*
* insert the directory item
*/
- ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
- if (ret) {
+ ret = btrfs_set_inode_index(parent_inode, &index);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
- btrfs_ino(BTRFS_I(parent_inode)),
+ btrfs_ino(parent_inode),
&fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
@@ -1729,8 +1747,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
}
/*
@@ -1740,13 +1760,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) { /* Transaction aborted */
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = record_root_in_trans(trans, root, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1781,7 +1801,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
BTRFS_NESTING_COW);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
btrfs_abort_transaction(trans, ret);
@@ -1792,21 +1812,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_wmb();
+ smp_mb__after_atomic();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = trans->transid;
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1816,9 +1838,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_add_root_ref(trans, objectid,
btrfs_root_id(parent_root),
- btrfs_ino(BTRFS_I(parent_inode)), index,
+ btrfs_ino(parent_inode), index,
&fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1833,7 +1855,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
ret = btrfs_reloc_post_snapshot(trans, pending);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1854,26 +1876,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
- BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ parent_inode, &key, BTRFS_FT_DIR,
index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
- btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
+ btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
fname.disk_name.len * 2);
- inode_set_mtime_to_ts(parent_inode,
- inode_set_ctime_current(parent_inode));
- ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
- if (ret) {
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, parent_inode);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1881,7 +1903,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1899,7 +1921,6 @@ free_fname:
free_pending:
kfree(new_root_item);
pending->root_item = NULL;
- btrfs_free_path(path);
pending->path = NULL;
return ret;
@@ -2095,7 +2116,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the new_bgs list during a transaction abort.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
}
}
@@ -2145,18 +2173,25 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
return;
lockdep_assert_held(&trans->fs_info->trans_lock);
- ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP,
+ "cur_trans->state=%d", cur_trans->state);
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
-static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+static void update_commit_stats(struct btrfs_fs_info *fs_info)
{
+ ktime_t now = ktime_get_ns();
+ ktime_t interval = now - fs_info->commit_stats.critical_section_start_time;
+
+ ASSERT(fs_info->commit_stats.critical_section_start_time);
+
fs_info->commit_stats.commit_count++;
fs_info->commit_stats.last_commit_dur = interval;
fs_info->commit_stats.max_commit_dur =
max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
fs_info->commit_stats.total_commit_dur += interval;
+ fs_info->commit_stats.critical_section_start_time = 0;
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2165,10 +2200,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
- ktime_t start_time;
- ktime_t interval;
- ASSERT(refcount_read(&trans->use_count) == 1);
+ ASSERT(refcount_read(&trans->use_count) == 1,
+ "refcount_read(&trans->use_count)=%d", refcount_read(&trans->use_count));
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
@@ -2257,14 +2291,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_blocked_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
- if (cur_trans->list.prev != &fs_info->trans_list) {
+ if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
- prev_trans = list_entry(cur_trans->list.prev,
- struct btrfs_transaction, list);
+ prev_trans = list_prev_entry(cur_trans, list);
if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
@@ -2300,8 +2333,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
*/
- start_time = ktime_get_ns();
-
+ fs_info->commit_stats.critical_section_start_time = ktime_get_ns();
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2406,7 +2438,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* them.
*
* We needn't worry that this operation will corrupt the snapshots,
- * because all the tree which are snapshoted will be forced to COW
+ * because all the trees that are snapshotted will be forced to COW
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
@@ -2533,6 +2565,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ update_commit_stats(fs_info);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2541,7 +2574,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
- btrfs_finish_extent_commit(trans);
+ ret = btrfs_finish_extent_commit(trans);
+ if (ret)
+ goto scrub_continue;
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
@@ -2567,8 +2602,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
- interval = ktime_get_ns() - start_time;
-
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2576,8 +2609,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- update_commit_stats(fs_info, interval);
-
return ret;
unlock_reloc:
@@ -2641,9 +2672,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, 0, 0);
+ ret = btrfs_drop_snapshot(root, false, false);
else
- ret = btrfs_drop_snapshot(root, 1, 0);
+ ret = btrfs_drop_snapshot(root, true, false);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 184fa5c0062a..18ef069197e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -14,10 +14,6 @@
#include <linux/wait.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
-#include "extent-io-tree.h"
-#include "block-rsv.h"
-#include "messages.h"
-#include "misc.h"
struct dentry;
struct inode;
@@ -227,7 +223,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int error);
+/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+static inline bool btrfs_abort_should_print_stack(int error)
+{
+ switch (error) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
@@ -240,7 +250,7 @@ do { \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
__first = true; \
- if (WARN(abort_should_print_stack(error), \
+ if (WARN(btrfs_abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
(error))) { \
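
For illustration only (not part of the patch): the new static inline keeps the same policy as the previously exported abort_should_print_stack() helper, suppressing the stack dump only for errors caused by external factors. A minimal sketch of the expected behaviour, assuming btrfs's usual single-argument ASSERT() and a hypothetical self-check helper:

	static void check_abort_stack_policy(void)
	{
		/* External factors: no stack trace on abort. */
		ASSERT(!btrfs_abort_should_print_stack(-EIO));
		ASSERT(!btrfs_abort_should_print_stack(-EROFS));
		ASSERT(!btrfs_abort_should_print_stack(-ENOMEM));

		/* Anything else, e.g. -ENOSPC or -EUCLEAN, is treated as a possible bug. */
		ASSERT(btrfs_abort_should_print_stack(-ENOSPC));
		ASSERT(btrfs_abort_should_print_stack(-EUCLEAN));
	}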
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 148d8cefa40e..c21c21adf61e 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -183,15 +183,16 @@ static bool check_prev_ino(struct extent_buffer *leaf,
/* Only these key->types needs to be checked */
ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY ||
key->type == BTRFS_DIR_INDEX_KEY ||
key->type == BTRFS_DIR_ITEM_KEY ||
- key->type == BTRFS_EXTENT_DATA_KEY);
+ key->type == BTRFS_EXTENT_DATA_KEY, "key->type=%u", key->type);
/*
* Only subvolume trees along with their reloc trees need this check.
* Things like log tree doesn't follow this ino requirement.
*/
- if (!is_fstree(btrfs_header_owner(leaf)))
+ if (!btrfs_is_fstree(btrfs_header_owner(leaf)))
return true;
if (key->objectid == prev_key->objectid)
@@ -475,7 +476,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* to be COWed to be relocated.
*/
if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
- !is_fstree(key->offset))) {
+ !btrfs_is_fstree(key->offset))) {
generic_err(leaf, slot,
"invalid reloc tree for root %lld, root id is not a subvolume tree",
key->offset);
@@ -493,7 +494,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
+ if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -764,22 +765,19 @@ static int check_block_group_item(struct extent_buffer *leaf,
return 0;
}
-__printf(4, 5)
+__printf(5, 6)
__cold
-static void chunk_err(const struct extent_buffer *leaf,
+static void chunk_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
const struct btrfs_chunk *chunk, u64 logical,
const char *fmt, ...)
{
- const struct btrfs_fs_info *fs_info = leaf->fs_info;
- bool is_sb;
+ bool is_sb = !leaf;
struct va_format vaf;
va_list args;
int i;
int slot = -1;
- /* Only superblock eb is able to have such small offset */
- is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
-
if (!is_sb) {
/*
* Get the slot number by iterating through all slots, this
@@ -812,13 +810,17 @@ static void chunk_err(const struct extent_buffer *leaf,
/*
* The common chunk check which could also work on super block sys chunk array.
*
+ * If @leaf is NULL, then @chunk must be an on-stack chunk item.
+ * (This is the case for the superblock sys_chunk array, where
+ * fs_info->sectorsize is not yet reliable.)
+ *
* Return -EUCLEAN if anything is corrupted.
* Return 0 if everything is OK.
*/
-int btrfs_check_chunk_valid(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 logical)
+int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ u32 sectorsize)
{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
u64 length;
u64 chunk_end;
u64 stripe_len;
@@ -826,63 +828,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
u16 sub_stripes;
u64 type;
u64 features;
+ u32 chunk_sector_size;
bool mixed = false;
int raid_index;
int nparity;
int ncopies;
- length = btrfs_chunk_length(leaf, chunk);
- stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
- type = btrfs_chunk_type(leaf, chunk);
+ if (leaf) {
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+ chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk);
+ } else {
+ length = btrfs_stack_chunk_length(chunk);
+ stripe_len = btrfs_stack_chunk_stripe_len(chunk);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ sub_stripes = btrfs_stack_chunk_sub_stripes(chunk);
+ type = btrfs_stack_chunk_type(chunk);
+ chunk_sector_size = btrfs_stack_chunk_sector_size(chunk);
+ }
raid_index = btrfs_bg_flags_to_raid_index(type);
ncopies = btrfs_raid_array[raid_index].ncopies;
nparity = btrfs_raid_array[raid_index].nparity;
if (unlikely(!num_stripes)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
if (unlikely(num_stripes < ncopies)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes < ncopies, have %u < %d",
num_stripes, ncopies);
return -EUCLEAN;
}
if (unlikely(nparity && num_stripes == nparity)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes == nparity, have %u == %d",
num_stripes, nparity);
return -EUCLEAN;
}
- if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(!IS_ALIGNED(logical, sectorsize))) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
- logical, fs_info->sectorsize);
+ logical, sectorsize);
return -EUCLEAN;
}
- if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(chunk_sector_size != sectorsize)) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk sectorsize, have %u expect %u",
- btrfs_chunk_sector_size(leaf, chunk),
- fs_info->sectorsize);
+ chunk_sector_size, sectorsize);
return -EUCLEAN;
}
- if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk length, have %llu", length);
return -EUCLEAN;
}
if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk logical start and length, have logical start %llu length %llu",
logical, length);
return -EUCLEAN;
}
if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk stripe length: %llu",
stripe_len);
return -EUCLEAN;
@@ -896,30 +908,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
* Thus it should be a good way to catch obvious bitflips.
*/
if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"chunk length too large: have %llu limit %llu",
length, btrfs_stripe_nr_to_offset(U32_MAX));
return -EUCLEAN;
}
if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- btrfs_chunk_type(leaf, chunk));
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
return -EUCLEAN;
}
if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return -EUCLEAN;
}
if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
type, BTRFS_BLOCK_GROUP_TYPE_MASK);
return -EUCLEAN;
@@ -928,7 +939,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
(type & (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"system chunk with data or metadata type: 0x%llx",
type);
return -EUCLEAN;
@@ -941,7 +952,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
if (!mixed) {
if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
(type & BTRFS_BLOCK_GROUP_DATA))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"mixed chunk type in non-mixed mode: 0x%llx", type);
return -EUCLEAN;
}
@@ -963,7 +974,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -983,14 +994,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
int num_stripes;
if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
- chunk_err(leaf, chunk, key->offset,
+ chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size(leaf, slot),
sizeof(struct btrfs_chunk),
- BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
@@ -1001,14 +1013,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
if (unlikely(btrfs_chunk_item_size(num_stripes) !=
btrfs_item_size(leaf, slot))) {
- chunk_err(leaf, chunk, key->offset,
+ chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
btrfs_item_size(leaf, slot),
btrfs_chunk_item_size(num_stripes));
return -EUCLEAN;
}
out:
- return btrfs_check_chunk_valid(leaf, chunk, key->offset);
+ return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset,
+ fs_info->sectorsize);
}
__printf(3, 4)
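
For illustration only (not part of the patch): with the extended btrfs_check_chunk_valid() signature shown above, a caller validating a superblock sys_chunk_array entry can pass leaf == NULL and supply the sector size read from the on-disk superblock, since fs_info->sectorsize is not yet reliable at that point. A hedged sketch, with a hypothetical helper name:

	static int check_sys_array_chunk(struct btrfs_fs_info *fs_info,
					 const struct btrfs_chunk *stack_chunk,
					 u64 logical)
	{
		/* The chunk was copied out of the superblock's sys_chunk_array. */
		const u32 sectorsize = btrfs_super_sectorsize(fs_info->super_copy);

		return btrfs_check_chunk_valid(fs_info, NULL, stack_chunk, logical,
					       sectorsize);
	}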
@@ -1197,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
/*
* For legacy root item, the members starting at generation_v2 will be
* all filled with 0.
- * And since we allow geneartion_v2 as 0, it will still pass the check.
+ * And since we allow generation_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
btrfs_item_size(leaf, slot));
@@ -1299,7 +1312,7 @@ static bool is_valid_dref_root(u64 rootid)
* - tree root
* For v1 space cache
*/
- return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
+ return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
rootid == BTRFS_ROOT_TREE_OBJECTID;
}
@@ -1527,6 +1540,11 @@ static int check_extent_item(struct extent_buffer *leaf,
dref_offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid data ref count, should have non-zero value");
+ return -EUCLEAN;
+ }
inline_refs += btrfs_extent_data_ref_count(leaf, dref);
break;
/* Contains parent bytenr and ref count */
@@ -1539,6 +1557,11 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid shared data ref count, should have non-zero value");
+ return -EUCLEAN;
+ }
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
@@ -1549,7 +1572,7 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_type);
return -EUCLEAN;
}
- if (inline_type < last_type) {
+ if (unlikely(inline_type < last_type)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u, prev type %u",
inline_type, last_type);
@@ -1558,7 +1581,7 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Type changed, allow the sequence starts from U64_MAX again. */
if (inline_type > last_type)
last_seq = U64_MAX;
- if (seq > last_seq) {
+ if (unlikely(seq > last_seq)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
inline_type, inline_offset, seq,
@@ -1595,10 +1618,9 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(prev_end > key->objectid)) {
extent_err(leaf, slot,
- "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
- prev_key->objectid, prev_key->type,
- prev_key->offset, key->objectid, key->type,
- key->offset);
+ "previous extent " BTRFS_KEY_FMT " overlaps current extent " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(prev_key),
+ BTRFS_KEY_FMT_VALUE(key));
return -EUCLEAN;
}
}
@@ -1611,8 +1633,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
{
u32 expect_item_size = 0;
- if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+ if (key->type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_shared_data_ref *sref;
+
+ sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref);
+ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid shared data backref count, should have non-zero value");
+ return -EUCLEAN;
+ }
+
expect_item_size = sizeof(struct btrfs_shared_data_ref);
+ }
if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
@@ -1689,6 +1721,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
offset, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid extent data backref count, should have non-zero value");
+ return -EUCLEAN;
+ }
}
return 0;
}
@@ -1719,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
@@ -1745,6 +1782,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_inode_extref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ while (ptr < end) {
+ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
+ u16 namelen;
+
+ if (unlikely(ptr + sizeof(*extref) > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
+ ptr, end, sizeof(*extref));
+ return -EUCLEAN;
+ }
+
+ namelen = btrfs_inode_extref_name_len(leaf, extref);
+ if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+ ptr += sizeof(*extref) + namelen;
+ }
+ return 0;
+}
+
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
{
@@ -1856,6 +1926,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_INODE_REF_KEY:
ret = check_inode_ref(leaf, key, prev_key, slot);
break;
+ case BTRFS_INODE_EXTREF_KEY:
+ ret = check_inode_extref(leaf, key, prev_key, slot);
+ break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
break;
@@ -1892,7 +1965,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
break;
}
- if (ret)
+ if (unlikely(ret))
return BTRFS_TREE_BLOCK_INVALID_ITEM;
return BTRFS_TREE_BLOCK_CLEAN;
}
@@ -1986,10 +2059,9 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
/* Make sure the keys are in the right order */
if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
generic_err(leaf, slot,
- "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
- prev_key.objectid, prev_key.type,
- prev_key.offset, key.objectid, key.type,
- key.offset);
+ "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&prev_key),
+ BTRFS_KEY_FMT_VALUE(&key));
return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
@@ -2107,10 +2179,9 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
generic_err(node, slot,
- "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
- key.objectid, key.type, key.offset,
- next_key.objectid, next_key.type,
- next_key.offset);
+ "bad key order, current " BTRFS_KEY_FMT " next " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key),
+ BTRFS_KEY_FMT_VALUE(&next_key));
return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
}
@@ -2130,7 +2201,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
- const bool is_subvol = is_fstree(root_owner);
+ const bool is_subvol = btrfs_is_fstree(root_owner);
const u64 eb_owner = btrfs_header_owner(eb);
/*
@@ -2172,7 +2243,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* For subvolume trees, owners can mismatch, but they should all belong
* to subvolume trees.
*/
- if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -2192,13 +2263,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != check->level) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree level check failed\n");
+ if (unlikely(found_level != check->level)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, check->level, found_level);
- return -EIO;
+ return -EUCLEAN;
}
if (!check->has_first_key)
@@ -2214,11 +2284,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
return 0;
/* We have @first_key, so this @eb must have at least one item */
- if (btrfs_header_nritems(eb) == 0) {
+ if (unlikely(btrfs_header_nritems(eb) == 0)) {
btrfs_err(fs_info,
"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return -EUCLEAN;
}
@@ -2226,11 +2296,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
- if (ret) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree first key check failed\n");
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
+ if (unlikely(ret)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, check->transid, check->first_key.objectid,
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index db67f96cbe4b..eb201f4ec3c7 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -10,6 +10,7 @@
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
+struct btrfs_fs_info;
struct btrfs_chunk;
struct btrfs_key;
@@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node);
int btrfs_check_leaf(struct extent_buffer *leaf);
int btrfs_check_node(struct extent_buffer *node);
-int btrfs_check_chunk_valid(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ u32 sectorsize);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
int btrfs_verify_level_key(struct extent_buffer *eb,
const struct btrfs_tree_parent_check *check);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c8d6587688b3..fff37c8d96a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,7 +27,9 @@
#include "file-item.h"
#include "file.h"
#include "orphan.h"
+#include "print-tree.h"
#include "tree-checker.h"
+#include "delayed-inode.h"
#define MAX_CONFLICT_INODES 10
@@ -101,18 +103,135 @@ enum {
LOG_WALK_REPLAY_ALL,
};
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key being processed of the current log leaf. */
+ struct btrfs_key log_key;
+ /* The slot being processed of the current log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
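A minimal userspace sketch of the pattern walk_control now follows: one context struct carries the per-walk state plus a per-block callback, rather than threading a long parameter list through every helper. All names below are illustrative stand-ins, not kernel code.

#include <stdio.h>

/* Per-walk state plus a per-block callback, in the walk_control style. */
struct demo_walk {
	int stage;
	int (*process)(int block, struct demo_walk *wc);
};

static int print_block(int block, struct demo_walk *wc)
{
	printf("stage %d: processing block %d\n", wc->stage, block);
	return 0;
}

int main(void)
{
	struct demo_walk wc = { .stage = 1, .process = print_block };

	for (int block = 0; block < 3; block++) {
		int ret = wc.process(block, &wc);

		if (ret)
			return ret;
	}
	return 0;
}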
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Furthermore, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
+
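The btrfs_abort_log_replay() macro above relies on a standard C idiom: a variadic helper takes the caller's __func__ and __LINE__, reports only the first failure, and a thin macro captures those at the call site. A minimal userspace sketch of that idiom, assuming nothing beyond libc (the kernel version uses an atomic test_and_set_bit, while this demo uses a plain flag):

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for BTRFS_FS_STATE_LOG_REPLAY_ABORTED; not atomic, demo only. */
static bool replay_abort_reported;

/* Variadic helper: report only the first failure, with caller context. */
static void report_replay_abort(const char *function, unsigned int line,
				int error, const char *fmt, ...)
{
	va_list args;

	if (replay_abort_reported)
		return;
	replay_abort_reported = true;

	fprintf(stderr, "log replay failed in %s:%u with error %d: ",
		function, line, error);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputc('\n', stderr);
}

/* The macro captures __func__ and __LINE__ at the call site. */
#define abort_replay(error, fmt, args...) \
	report_replay_abort(__func__, __LINE__, (error), fmt, ##args)

int main(void)
{
	abort_replay(-5, "failed to read block %llu", 12845056ULL);
	abort_replay(-22, "suppressed: only the first report is printed");
	return 0;
}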
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all);
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
@@ -138,10 +257,13 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
- struct inode *inode;
+ struct btrfs_inode *inode;
+
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
/*
* We're holding a transaction handle whether we are logging or
@@ -297,54 +419,13 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /*
- * Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
- */
- bool ignore_cur_inode;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen, int level);
-};
-
-/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
+static int process_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -359,29 +440,40 @@ static int process_one_buffer(struct btrfs_root *log,
};
ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
- if (ret)
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
- if (btrfs_buffer_uptodate(eb, gen, 0) &&
- btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
ret = btrfs_exclude_logged_extents(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The destination root (wc->root) is the tree we are copying into, and
+ * wc->subvol_path is a scratch path for use in this function (it should be
+ * released on entry and will be released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -389,19 +481,19 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
/*
* This is only used during log replay, so the root is always from a
@@ -410,45 +502,46 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
- item_size = btrfs_item_size(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
/* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
return ret;
+ }
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
- btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
+ if (!src_copy) {
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
-
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -457,7 +550,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -465,28 +558,28 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
- item = btrfs_item_ptr(eb, slot,
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
u32 mode;
@@ -494,40 +587,43 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- path->skip_release_on_error = 1;
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
- path->skip_release_on_error = 0;
+ wc->subvol_path->skip_release_on_error = true;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = false;
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
- u32 found_size;
- found_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
- btrfs_truncate_item(trans, path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(trans, path, item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -538,16 +634,15 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0) {
- struct extent_buffer *dst_eb = path->nodes[0];
- const u64 ino_size = btrfs_inode_size(eb, src_item);
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
/*
* For regular files an ino_size == 0 is used only when
@@ -556,42 +651,39 @@ insert:
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
- if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
- if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
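To make the control flow of the rewritten overwrite_item() easier to follow, here is the same decision sequence reduced to plain memory in a userspace sketch (struct and function names are made up for illustration): an identical existing item means nothing to do, a size mismatch means truncate or extend before copying, and a missing item means insert and then copy.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One in-memory "item" standing in for a leaf item in the destination tree. */
struct demo_item {
	size_t size;
	unsigned char *data;
};

static int overwrite_demo(struct demo_item *dst, const void *src, size_t src_size)
{
	if (dst->data && dst->size == src_size &&
	    memcmp(dst->data, src, src_size) == 0)
		return 0;			/* contents already match */

	if (!dst->data || dst->size != src_size) {
		unsigned char *tmp = realloc(dst->data, src_size);

		if (!tmp)
			return -ENOMEM;
		dst->data = tmp;
		dst->size = src_size;		/* truncate or extend the item */
	}

	memcpy(dst->data, src, src_size);	/* the copy_extent_buffer() step */
	return 0;
}

int main(void)
{
	struct demo_item dst = { 0 };
	const char v1[] = "inode item v1";
	const char v2[] = "inode item v2 (bigger)";

	printf("insert: %d\n", overwrite_demo(&dst, v1, sizeof(v1)));
	printf("no-op:  %d\n", overwrite_demo(&dst, v1, sizeof(v1)));
	printf("resize: %d\n", overwrite_demo(&dst, v2, sizeof(v2)));
	free(dst.data);
	return 0;
}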
@@ -610,21 +702,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- inode = NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -637,51 +714,53 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
- unsigned long size;
+ struct btrfs_inode *inode = NULL;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_ram_bytes(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size,
- fs_info->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
}
/*
@@ -689,249 +768,300 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
- btrfs_release_path(path);
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
- if (ret)
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
- goto update_inode;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
+ goto update_inode;
+ }
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
- /*
- * Manually record dirty extent, as here we did a shallow
- * file extent item copy and skip normal backref update,
- * but modifying extent tree all by ourselves.
- * So need to manually record dirty extent for qgroup,
- * as the owner of the file extent changed from log tree
- * (doesn't affect qgroup) to fs/file tree(affects qgroup)
- */
- ret = btrfs_qgroup_trace_extent(trans,
- btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item));
- if (ret < 0)
- goto out;
+ /* A hole and NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
- if (ins.objectid > 0) {
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
- ins.offset);
- if (ret < 0) {
- goto out;
- } else if (ret == 0) {
- struct btrfs_ref ref = {
- .action = BTRFS_ADD_DELAYED_REF,
- .bytenr = ins.objectid,
- .num_bytes = ins.offset,
- .owning_root = btrfs_root_id(root),
- .ref_root = btrfs_root_id(root),
- };
- btrfs_init_data_ref(&ref, key->objectid, offset,
- 0, false);
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- btrfs_root_id(root),
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
- ret = btrfs_lookup_csums_list(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, false);
- if (ret < 0)
- goto out;
- ret = 0;
- /*
- * Now delete all existing cums in the csum root that
- * cover our range. We do this because we can have an
- * extent that is completely referenced by one file
- * extent item and partially referenced by another
- * file extent item (like after using the clone or
- * extent_same ioctls). In this case if we end up doing
- * the replay of the one that partially references the
- * extent first, and we do not do the csum deletion
- * below, we can get 2 csum items in the csum tree that
- * overlap each other. For example, imagine our log has
- * the two following file extent items:
- *
- * key (257 EXTENT_DATA 409600)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 20480 nr 20480 ram 102400
- *
- * key (257 EXTENT_DATA 819200)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 0 nr 102400 ram 102400
- *
- * Where the second one fully references the 100K extent
- * that starts at disk byte 12845056, and the log tree
- * has a single csum item that covers the entire range
- * of the extent:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- *
- * After the first file extent item is replayed, the
- * csum tree gets the following csum item:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which covers the 20K sub-range starting at offset 20K
- * of our extent. Now when we replay the second file
- * extent item, if we do not delete existing csum items
- * that cover any of its blocks, we end up getting two
- * csum items in our csum tree that overlap each other:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which is a problem, because after this anyone trying
- * to lookup up for the checksum of any block of our
- * extent starting at an offset of 40K or higher, will
- * end up looking at the second csum item only, which
- * does not contain the checksum for any block starting
- * at offset 40K or higher of our extent.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- struct btrfs_root *csum_root;
-
- sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- csum_root = btrfs_csum_root(fs_info,
- sums->logical);
- if (!ret)
- ret = btrfs_del_csums(trans, csum_root,
- sums->logical,
- sums->len);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- csum_root,
- sums);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ /*
+ * Manually record the dirty extent: here we did a shallow file extent
+ * item copy and skipped the normal backref update, modifying the extent
+ * tree all by ourselves. So we need to manually record the dirty extent
+ * for qgroup, as the owner of the file extent changed from the log tree
+ * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
goto out;
+ }
+ }
+
+ btrfs_release_path(wc->subvol_path);
+
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+ * Now delete all existing csums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying to look up the
+ * checksum of any block of our extent starting at an offset of 40K or
+ * higher will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ list_del(&sums->list);
+ kfree(sums);
+ }
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
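The numbers in the csum-overlap comment above check out; here is a quick userspace calculation, assuming 4 KiB sectors and 4-byte CRC32C checksums (both assumptions chosen for the example, not stated by the patch):

#include <stdio.h>

int main(void)
{
	const unsigned long long disk_bytenr = 12845056ULL;	/* extent start */
	const unsigned long long disk_num_bytes = 102400ULL;	/* 100K extent */
	const unsigned long long offset = 20480ULL;		/* first item's extent data offset */
	const unsigned long long num_bytes = 20480ULL;		/* first item's length */
	const unsigned long long sectorsize = 4096ULL;		/* assumed */
	const unsigned long long csum_size = 4ULL;		/* assumed CRC32C */

	/* Csum item covering the whole extent, as found in the log tree. */
	printf("key (EXTENT_CSUM EXTENT_CSUM %llu) itemsize %llu\n",
	       disk_bytenr, disk_num_bytes / sectorsize * csum_size);

	/* Csum item left after replaying only the first file extent item. */
	printf("key (EXTENT_CSUM EXTENT_CSUM %llu) itemsize %llu\n",
	       disk_bytenr + offset, num_bytes / sectorsize * csum_size);
	return 0;
}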
-static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+static int unlink_inode_for_log_replay(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const struct fscrypt_str *name)
{
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
ret = btrfs_unlink_inode(trans, dir, inode, name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
return ret;
+ }
/*
* Whenever we need to check if a name exists or not, we check the
* fs/subvolume tree. So after an unlink we must run delayed items, so
* that future checks for a name during log replay see that the name
* does not exist anymore.
*/
- return btrfs_run_delayed_items(trans);
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
+ return ret;
}
/*
@@ -942,41 +1072,48 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+static noinline int drop_one_dir_item(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
- struct extent_buffer *leaf;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1039,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log,
u64 ref_objectid,
const struct fscrypt_str *name)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -1047,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret == 1) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
@@ -1061,172 +1196,224 @@ static noinline int backref_in_log(struct btrfs_root *log,
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0], name);
-out:
- btrfs_free_path(path);
return ret;
}
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static int unlink_refs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
struct btrfs_inode *dir,
- struct btrfs_inode *inode,
- u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, struct fscrypt_str *name)
+ struct btrfs_inode *inode)
{
- int ret;
- struct extent_buffer *leaf;
- struct btrfs_dir_item *di;
- struct btrfs_key search_key;
- struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
-again:
- /* Search old style refs */
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ /*
+ * Check all the names in this back reference to see if they are in the
+ * log. If so, we allow them to stay; otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
- */
- if (search_key.objectid == search_key.offset)
- return 1;
-
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- struct fscrypt_str victim_name;
+ int ret;
- victim_ref = (struct btrfs_inode_ref *)ptr;
- ret = read_alloc_one_name(leaf, (victim_ref + 1),
- btrfs_inode_ref_name_len(leaf, victim_ref),
- &victim_name);
- if (ret)
- return ret;
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
kfree(victim_name.name);
return ret;
- } else if (!ret) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans, dir, inode,
- &victim_name);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
}
kfree(victim_name.name);
-
ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
}
- }
- btrfs_release_path(path);
- /* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name,
- inode_objectid, parent_objectid, 0,
- 0);
- if (IS_ERR(extref)) {
- return PTR_ERR(extref);
- } else if (extref) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct inode *victim_parent;
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
- leaf = path->nodes[0];
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
- item_size = btrfs_item_size(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ return 0;
+}
- while (cur_offset < item_size) {
- struct fscrypt_str victim_name;
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
+{
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ u32 cur_offset = 0;
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
+ struct btrfs_inode_extref *extref;
+ struct fscrypt_str victim_name;
+ int ret;
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
- ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
- if (ret)
- return ret;
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
+ goto next;
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name.name,
- victim_name.len);
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
+
+ search_key->objectid = btrfs_ino(inode);
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
kfree(victim_name.name);
return ret;
- } else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
- inode, &victim_name);
- }
- iput(victim_parent);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
}
kfree(victim_name.name);
next:
cur_offset += victim_name.len + sizeof(*extref);
+ continue;
}
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
- btrfs_release_path(path);
+
+ return 0;
+}
+
+static inline int __add_inode_ref(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 ref_index, struct fscrypt_str *name)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
+
+again:
+ /* Search old style refs */
+ search_key.objectid = btrfs_ino(inode);
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&search_key),
+ btrfs_root_id(root));
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
+ */
+ if (search_key.objectid == search_key.offset)
+ return 1;
+
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
ref_index, name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
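The helpers above return -EAGAIN after unlinking a conflicting name because the unlink invalidates the caller's search position, and __add_inode_ref() then restarts from its "again" label. A tiny userspace sketch of that restart pattern, with a counter standing in for the remaining conflicting names:

#include <errno.h>
#include <stdio.h>

static int names_left = 3;

/* Mutates shared state and asks the caller to restart its search. */
static int drop_one_conflict(void)
{
	if (names_left == 0)
		return 0;		/* nothing conflicting remains */
	names_left--;			/* the "unlink" stales the cursor */
	return -EAGAIN;
}

int main(void)
{
	int ret;

again:
	ret = drop_one_conflict();
	if (ret == -EAGAIN)
		goto again;		/* re-run the search from scratch */
	printf("done, ret=%d conflicts left=%d\n", ret, names_left);
	return 0;
}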
@@ -1279,66 +1466,81 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
-static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_inode *inode,
- struct extent_buffer *log_eb,
- int log_slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
+ struct btrfs_root *root = wc->root;
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
goto out;
+ }
- eb = path->nodes[0];
- ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
while (ref_ptr < ref_end) {
struct fscrypt_str name;
u64 parent_id;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
} else {
- parent_id = key->offset;
+ parent_id = wc->log_key.offset;
ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
- btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ btrfs_release_path(wc->subvol_path);
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1346,57 +1548,51 @@ again:
kfree(name.name);
ref_ptr += name.len;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Replay one inode back reference item found in the log tree.
+ * The path (wc->subvol_path) is for temporary use by this function and is released on return.
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int add_inode_ref(struct walk_control *wc)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- int log_ref_ver = 0;
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1404,41 +1600,93 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &name,
+ if (is_extref_item) {
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may not have been
+ * logged and may not exist in the
+ * subvolume tree; see the comment
+ * above, before the loop, where we
+ * get the first parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
+ }
+ goto out;
+ }
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (ret == 0) {
/*
@@ -1448,10 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
- inode_objectid, parent_objectid,
- ref_index, &name);
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
@@ -1459,22 +1704,34 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
- if (ret)
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret)
+ ret = btrfs_update_inode(trans, inode);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
- iput(dir);
+ if (is_extref_item && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1487,18 +1744,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(wc, inode);
if (ret)
goto out;
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
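
The loop above walks several references packed inside one log item by advancing ref_ptr past a fixed header plus the inline name on every iteration. Below is a minimal userspace sketch of that iteration pattern; the record layout and names are illustrative only, not the on-disk btrfs_inode_ref/btrfs_inode_extref format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_ref {
	uint64_t index;
	uint16_t name_len;
	/* name_len bytes of name follow immediately after the header */
} __attribute__((packed));

static void walk_refs(const uint8_t *buf, size_t size)
{
	const uint8_t *ptr = buf;
	const uint8_t *end = buf + size;

	while (ptr < end) {
		struct fake_ref ref;

		memcpy(&ref, ptr, sizeof(ref));
		printf("index %llu name %.*s\n", (unsigned long long)ref.index,
		       ref.name_len, (const char *)(ptr + sizeof(ref)));
		/* Advance by the fixed header plus the inline name. */
		ptr += sizeof(ref) + ref.name_len;
	}
}

int main(void)
{
	uint8_t buf[64];
	size_t off = 0;
	const char *names[] = { "a", "dir" };

	for (int i = 0; i < 2; i++) {
		struct fake_ref ref = {
			.index = (uint64_t)i,
			.name_len = (uint16_t)strlen(names[i]),
		};

		memcpy(buf + off, &ref, sizeof(ref));
		memcpy(buf + off + sizeof(ref), names[i], ref.name_len);
		off += sizeof(ref) + ref.name_len;
	}
	walk_refs(buf, off);
	return 0;
}
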
@@ -1611,26 +1869,22 @@ process_slot:
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct inode *inode)
+static noinline int fixup_inode_link_count(struct walk_control *wc,
+ struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = inode->root;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ const u64 ino = btrfs_ino(inode);
- ret = count_inode_refs(BTRFS_I(inode), path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(BTRFS_I(inode), path);
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1638,19 +1892,18 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
@@ -1660,62 +1913,63 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *inode;
+
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
ret = 0;
- if (path->slots[0] == 0)
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
break;
- btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ btrfs_release_path(wc->subvol_path);
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, inode);
- iput(inode);
+ ret = fixup_inode_link_count(wc, inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
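
fixup_inode_link_count() above recomputes nlink by counting INODE_REF and INODE_EXTREF items and only rewrites the inode item when the counted value differs from the stored one. A tiny sketch of that recount-and-fix step, with a plain array standing in for the back references found in the subvolume tree:

#include <stdio.h>

struct fake_backref {
	unsigned long long parent_dir;
	const char *name;
};

int main(void)
{
	/* Back references present after the names have been replayed. */
	const struct fake_backref refs[] = {
		{ 256, "file" },
		{ 260, "hardlink-to-file" },
	};
	unsigned int stored_nlink = 1;	/* value currently in the inode item */
	unsigned int counted = sizeof(refs) / sizeof(refs[0]);

	if (counted != stored_nlink) {
		printf("fixup: nlink %u -> %u\n", stored_nlink, counted);
		stored_nlink = counted;
	}
	if (stored_nlink == 0)
		printf("no links left: inode would be orphaned/deleted\n");
	return 0;
}
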
@@ -1725,36 +1979,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1770,33 +2038,31 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
-static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+static int delete_conflicting_dir_entry(struct walk_control *wc,
struct btrfs_inode *dir,
- struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
u8 log_flags,
@@ -1804,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
{
struct btrfs_key found_key;
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
/* The existing dentry points to the same inode, don't delete it. */
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1819,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (!exists)
return 0;
- return drop_one_dir_item(trans, path, dir, dst_di);
+ return drop_one_dir_item(wc, dir, dst_di);
}
/*
@@ -1838,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct fscrypt_str name = { 0 };
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
@@ -1852,62 +2115,92 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
- log_flags = btrfs_dir_flags(eb, di);
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- btrfs_release_path(path);
- if (ret < 0)
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
goto out;
+ }
exists = (ret == 0);
ret = 0;
- dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- &name, 1);
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
- if (ret < 0)
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
dir_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid, key->offset,
- &name, 1);
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
- if (ret < 0)
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
index_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (dir_dst_matches && index_dst_matches) {
ret = 0;
@@ -1921,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
*/
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = wc->log_key.objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1934,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1944,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
update_size = false;
goto out;
}
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, key->objectid, key->offset,
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
&name, &log_key);
- if (ret && ret != -ENOENT && ret != -EEXIST)
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
if (!ret)
name_added = true;
update_size = false;
@@ -1956,31 +2261,32 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
}
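
Note the two backref_in_log() lookups above: for an INODE_REF_KEY the key offset is the parent directory objectid itself, while for an INODE_EXTREF_KEY it is a name hash seeded with that objectid. The standalone sketch below shows how such an offset could be computed, assuming btrfs_extref_hash() is CRC-32C seeded with the parent objectid (bitwise implementation, illustrative values only):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reflected CRC-32C (Castagnoli), bitwise, no lookup table. */
static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
	}
	return crc;
}

/* Offset used for a BTRFS_INODE_EXTREF_KEY: hash of (parent dir, name). */
static uint64_t extref_hash(uint64_t parent_objectid, const char *name, int len)
{
	return crc32c_sw((uint32_t)parent_objectid, name, len);
}

int main(void)
{
	const char *name = "foo";

	printf("extref key offset for parent 256, name \"%s\": %llu\n", name,
	       (unsigned long long)extref_hash(256, name, strlen(name)));
	return 0;
}

The exact value on a real filesystem comes from the kernel's crc32c(); the point here is only that equal (parent objectid, name) pairs map to the same extref key offset, which is why the hash can be recomputed during replay.
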
/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
int ret;
struct btrfs_dir_item *di;
/* We only log dir index keys, which only contain a single dir item. */
- ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
+ "wc->log_key.type=%u", wc->log_key.type);
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- ret = replay_one_name(trans, root, path, eb, di, key);
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
if (ret < 0)
return ret;
@@ -2010,17 +2316,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
	 * to ever delete the parent directory as it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_path *fixup_path;
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
struct btrfs_key di_key;
- fixup_path = btrfs_alloc_path();
- if (!fixup_path)
- return -ENOMEM;
-
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
- btrfs_free_path(fixup_path);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
return ret;
@@ -2113,20 +2413,20 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
- struct inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_inode *dir,
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2135,23 +2435,33 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
*/
- ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
- eb = path->nodes[0];
- slot = path->slots[0];
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
goto out;
+ }
- if (log) {
+ if (!force_remove) {
struct btrfs_dir_item *log_di;
- log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (log_di) {
/* The dentry exists in the log, we have nothing to do. */
@@ -2161,87 +2471,99 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
}
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
* (an index number), so we're done.
*/
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
-static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- const u64 ino)
+static int replay_xattr_deletes(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
struct btrfs_key search_key;
- struct btrfs_path *log_path;
- int i;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
int nritems;
int ret;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
goto out;
+ }
process_leaf:
- nritems = btrfs_header_nritems(path->nodes[0]);
- for (i = path->slots[0]; i < nritems; i++) {
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
- btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
- di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size(path->nodes[0], i);
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
cur = 0;
while (cur < total_size) {
- u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
- u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
goto out;
}
- read_extent_buffer(path->nodes[0], name,
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
@@ -2249,40 +2571,59 @@ process_leaf:
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, ino,
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
name, name_len, -1);
- kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
- path, di);
- if (ret)
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(wc->subvol_path);
+ kfree(name);
search_key = key;
goto again;
}
- kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
+ kfree(name);
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(root, wc->subvol_path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_free_path(log_path);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -2297,34 +2638,41 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all)
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
- struct btrfs_path *log_path;
- struct inode *dir;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+	 * It isn't an error if the inode isn't there; that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
- btrfs_free_path(log_path);
- return 0;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
+ return ret;
}
range_start = 0;
@@ -2333,32 +2681,45 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
goto out;
- else if (ret > 0)
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key " BTRFS_KEY_FMT,
+ btrfs_root_id(root),
+ BTRFS_KEY_FMT_VALUE(&dir_key));
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
- else if (ret < 0)
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
found_key.type != dir_key.type) {
ret = 0;
@@ -2368,25 +2729,22 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
ret = 0;
out:
- btrfs_release_path(path);
- btrfs_free_path(log_path);
- iput(dir);
+ btrfs_release_path(wc->subvol_path);
+ iput(&dir->vfs_inode);
return ret;
}
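
replay_dir_deletes() above removes a directory entry only when its index falls inside a range the log claims to cover but no matching entry exists in the log tree (or when del_all forces removal). The sketch below condenses that decision, with plain arrays standing in for the dir log ranges and the logged indexes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct logged_range { uint64_t start, end; };

static bool index_in_logged_range(const struct logged_range *r, int nr, uint64_t index)
{
	for (int i = 0; i < nr; i++)
		if (index >= r[i].start && index <= r[i].end)
			return true;
	return false;
}

int main(void)
{
	/* The log is authoritative for dir indexes 2..5 and 9..9. */
	const struct logged_range ranges[] = { { 2, 5 }, { 9, 9 } };
	/* Dir indexes currently present in the subvolume tree. */
	const uint64_t existing[] = { 1, 3, 4, 7, 9 };
	/* Dir indexes that also have an entry in the log tree. */
	const uint64_t in_log[] = { 3, 9 };

	for (unsigned i = 0; i < sizeof(existing) / sizeof(existing[0]); i++) {
		uint64_t idx = existing[i];
		bool logged = false;

		for (unsigned j = 0; j < sizeof(in_log) / sizeof(in_log[0]); j++)
			if (in_log[j] == idx)
				logged = true;

		if (index_in_logged_range(ranges, 2, idx) && !logged)
			printf("index %llu: unlink (covered by log, not in it)\n",
			       (unsigned long long)idx);
		else
			printf("index %llu: keep\n", (unsigned long long)idx);
	}
	return 0;
}
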
@@ -2401,7 +2759,7 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+static int replay_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
@@ -2409,44 +2767,62 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
.transid = gen,
.level = level
};
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int i;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*
+ * Set to NULL since it was not yet read and in case we abort log replay
+ * on error, we have no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2454,19 +2830,23 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
@@ -2480,36 +2860,48 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
- &drop_args);
- if (!ret) {
- inode_sub_bytes(inode,
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
@@ -2517,10 +2909,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (wc->ignore_cur_inode)
continue;
- if (key.type == BTRFS_DIR_INDEX_KEY &&
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ ret = replay_one_dir_item(wc);
if (ret)
break;
}
@@ -2529,21 +2920,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
+ if (ret)
break;
- ret = 0;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
@@ -2554,37 +2941,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* older kernel with such keys, ignore them.
*/
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
-/*
- * Correctly adjust the reserved bytes occupied by a log tree extent buffer
- */
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
-{
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, start);
- if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
- }
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->reserved -= fs_info->nodesize;
- cache->space_info->bytes_reserved -= fs_info->nodesize;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-}
-
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
@@ -2592,22 +2958,38 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
+ int ret;
+
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
}
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+
return 0;
}
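
With no transaction to pin the extent, clean_log_buffer() above gives one node's worth of reservation back to both the block group and its space_info, taking the space_info lock before the block group lock. A userspace sketch of that accounting under the same lock order; the structures are stand-ins, not the kernel types:

#include <pthread.h>
#include <stdio.h>

struct fake_space_info {
	pthread_mutex_t lock;
	unsigned long long bytes_reserved;
};

struct fake_block_group {
	pthread_mutex_t lock;
	unsigned long long reserved;
	struct fake_space_info *space_info;
};

static void unaccount_buffer(struct fake_block_group *bg, unsigned long long nodesize)
{
	/* Same lock order as the kernel code: space_info, then block group. */
	pthread_mutex_lock(&bg->space_info->lock);
	pthread_mutex_lock(&bg->lock);
	bg->reserved -= nodesize;
	bg->space_info->bytes_reserved -= nodesize;
	pthread_mutex_unlock(&bg->lock);
	pthread_mutex_unlock(&bg->space_info->lock);
}

int main(void)
{
	struct fake_space_info si = { PTHREAD_MUTEX_INITIALIZER, 65536 };
	struct fake_block_group bg = { PTHREAD_MUTEX_INITIALIZER, 16384, &si };

	unaccount_buffer(&bg, 16384);
	printf("bg reserved %llu, space_info reserved %llu\n",
	       bg.reserved, si.bytes_reserved);
	return 0;
}
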
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -2635,12 +3017,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen,
- *level - 1);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2651,6 +3038,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2666,6 +3057,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2682,10 +3077,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
int i;
int slot;
@@ -2699,14 +3092,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- ret = wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
- ret = clean_log_buffer(trans, path->nodes[*level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
if (ret)
return ret;
}
@@ -2723,13 +3116,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2739,40 +3132,34 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- atomic_inc(&log->node->refs);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
- goto out;
+ return ret;
if (wc->free)
- ret = clean_log_buffer(trans, path->nodes[orig_level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2951,7 +3338,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
- ASSERT(log_transid == root->log_transid);
+ ASSERT(log_transid == root->log_transid,
+ "log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
@@ -2980,9 +3368,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
@@ -3091,7 +3479,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = root_log_ctx.log_ret;
goto out;
}
- ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
+ "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
+ root_log_ctx.log_transid, log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
@@ -3113,7 +3503,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
@@ -3133,7 +3523,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
- EXTENT_NEW | EXTENT_DIRTY);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3181,7 +3571,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
mutex_unlock(&fs_info->tree_log_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
@@ -3195,7 +3585,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
+ "last_log_commit(root)=%d log_transid=%d",
+ btrfs_get_root_last_log_commit(root), log_transid);
btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
@@ -3233,12 +3625,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
{
int ret;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
if (log->node) {
- ret = walk_log_tree(trans, log, &wc);
+ ret = walk_log_tree(&wc);
if (ret) {
/*
* We weren't able to traverse the entire log tree, the
@@ -3259,9 +3653,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
*/
btrfs_write_marked_extents(log->fs_info,
&log->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
btrfs_wait_tree_log_extents(log,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (trans)
btrfs_abort_transaction(trans, ret);
@@ -3270,8 +3664,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- extent_io_tree_release(&log->dirty_log_pages);
- extent_io_tree_release(&log->log_csum_range);
+ btrfs_extent_io_tree_release(&log->dirty_log_pages);
+ btrfs_extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
}
@@ -3301,6 +3695,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up not logging new names and not removing
+ * old names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
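
mark_inode_as_not_logged() together with the inode_logged() changes below pairs a lockless fast-path read of ->logged_trans with a locked re-check before deciding or updating, so a stale "not logged" value can never overwrite one set by a concurrent logging task. A self-contained sketch of that double-check pattern; the names and the pthread spinlock are illustrative, not the kernel's btrfs_inode fields:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_inode {
	pthread_spinlock_t lock;
	uint64_t logged_trans;	/* 0 means "unknown / never logged" */
};

/* Returns true if the inode is considered logged in transaction @transid. */
static bool fake_inode_logged(struct fake_inode *inode, uint64_t transid)
{
	bool ret;

	/* Fast path: once set to @transid it is never lowered again. */
	if (__atomic_load_n(&inode->logged_trans, __ATOMIC_RELAXED) == transid)
		return true;

	/* Slow path: take the lock and re-check before deciding/updating. */
	pthread_spin_lock(&inode->lock);
	if (inode->logged_trans == transid) {
		ret = true;
	} else if (inode->logged_trans > 0) {
		ret = false;	/* logged in some older transaction only */
	} else {
		/* Unknown: record "not logged in @transid" without racing. */
		inode->logged_trans = transid - 1;
		ret = false;
	}
	pthread_spin_unlock(&inode->lock);
	return ret;
}

int main(void)
{
	struct fake_inode ino = { .logged_trans = 0 };

	pthread_spin_init(&ino.lock, PTHREAD_PROCESS_PRIVATE);
	printf("%d\n", fake_inode_logged(&ino, 42));	/* 0: marked as 41 */
	ino.logged_trans = 42;
	printf("%d\n", fake_inode_logged(&ino, 42));	/* 1 */
	pthread_spin_destroy(&ino.lock);
	return 0;
}
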
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3318,15 +3737,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+	 * torn into two 32-bit updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3335,10 +3771,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3392,29 +3826,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
	 * Set logged_trans to a value greater than 0 and less than the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
* The inode was previously logged and then evicted, set logged_trans to
- * the current transacion's ID, to avoid future tree searches as long as
+ * the current transaction's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
@@ -3451,7 +3873,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* inode item because on log replay we update the field to reflect
* all existing entries in the directory (see overwrite_item()).
*/
- return btrfs_delete_one_dir_name(trans, log, path, di);
+ return btrfs_del_item(trans, log, path);
}
/*
@@ -3476,37 +3898,36 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* or the entire directory.
*/
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_path *path;
+ struct btrfs_root *root = dir->root;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
ret = inode_logged(trans, dir, NULL);
if (ret == 0)
return;
- else if (ret < 0) {
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
btrfs_set_log_full_commit(trans);
return;
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
mutex_lock(&dir->log_mutex);
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
name, index);
- btrfs_free_path(path);
-out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret < 0)
btrfs_set_log_full_commit(trans);
@@ -3515,12 +3936,11 @@ out_unlock:
/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
- struct btrfs_inode *inode, u64 dirid)
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir)
{
- struct btrfs_root *log;
- u64 index;
+ struct btrfs_root *root = dir->root;
int ret;
ret = inode_logged(trans, inode, NULL);
@@ -3532,13 +3952,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
- log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
- dirid, &index);
+ ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode),
+ btrfs_ino(dir), NULL);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
@@ -3561,8 +3981,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
- key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = first_offset;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
/*
* -EEXIST is fine and can happen sporadically when we are logging a
@@ -3588,7 +4008,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
last_offset = max(last_offset, curr_end);
}
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -3601,7 +4020,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
int count)
{
struct btrfs_root *log = inode->root->log_root;
- char *ins_data = NULL;
+ char AUTO_KFREE(ins_data);
struct btrfs_item_batch batch;
struct extent_buffer *dst;
unsigned long src_offset;
@@ -3612,7 +4031,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
int ret;
int i;
- ASSERT(count > 0);
+ ASSERT(count > 0, "count=%d", count);
batch.nr = count;
if (count == 1) {
@@ -3625,8 +4044,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- ins_data = kmalloc(count * sizeof(u32) +
- count * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -3647,7 +4065,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret)
- goto out;
+ return ret;
dst = dst_path->nodes[0];
/*
@@ -3666,7 +4084,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
btrfs_release_path(dst_path);
last_index = batch.keys[count - 1].offset;
- ASSERT(last_index > inode->last_dir_index_offset);
+ ASSERT(last_index > inode->last_dir_index_offset,
+ "last_index=%llu inode->last_dir_index_offset=%llu",
+ last_index, inode->last_dir_index_offset);
/*
* If for some unexpected reason the last item's index is not greater
@@ -3679,8 +4099,6 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
if (btrfs_get_first_dir_index_to_log(inode) == 0)
btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
-out:
- kfree(ins_data);
return ret;
}
@@ -3704,7 +4122,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
* Add extra ref to scratch eb so that it is not freed when callers
* release the path, so we can reuse it later if needed.
*/
- atomic_inc(&ctx->scratch_eb->refs);
+ refcount_inc(&ctx->scratch_eb->refs);
return 0;
}
@@ -3739,7 +4157,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
for (int i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
- int ret;
btrfs_item_key_to_cpu(src, &key, i);
@@ -3809,8 +4226,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
}
if (batch_size > 0) {
- int ret;
-
ret = flush_dir_items_batch(trans, inode, src, dst_path,
batch_start, batch_size);
if (ret < 0)
@@ -3995,7 +4410,9 @@ done:
* change in the current transaction), then we don't need to log
* a range, last_old_dentry_offset is == to last_offset.
*/
- ASSERT(last_old_dentry_offset <= last_offset);
+ ASSERT(last_old_dentry_offset <= last_offset,
+ "last_old_dentry_offset=%llu last_offset=%llu",
+ last_old_dentry_offset, last_offset);
if (last_old_dentry_offset < last_offset)
ret = insert_dir_log_key(trans, log, path, ino,
last_old_dentry_offset + 1,
@@ -4007,7 +4424,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4023,7 +4440,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
@@ -4189,47 +4606,40 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only,
+ struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
if (log_inode_only) {
/* set the generation to zero so the recovery code
* can tell the difference between logging
* just to say 'this inode exists' and logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(&token, item, 0);
- btrfs_set_token_inode_size(&token, item, logged_isize);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_size(&token, item, inode->i_size);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
}
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4240,13 +4650,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* inode item in subvolume tree as needed (see overwrite_item()).
*/
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
@@ -4292,7 +4702,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
- 0, 0);
+ false, 0);
btrfs_release_path(path);
return 0;
}
@@ -4320,8 +4730,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4337,8 +4747,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
return ret;
}
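/*
 * Illustrative sketch (assumption): the btrfs_-prefixed names keep the
 * usual lock/operate/unlock pattern on an extent_io_tree range, with an
 * optional cached state that speeds up the unlock.
 */
static int example_locked_range_op(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct extent_state *cached_state = NULL;
	int ret;

	ret = btrfs_lock_extent(tree, start, end, &cached_state);
	if (ret)
		return ret;
	/* ... operate on the locked range [start, end] ... */
	btrfs_unlock_extent(tree, start, end, &cached_state);
	return 0;
}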
@@ -4357,7 +4767,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
- char *ins_data;
+ char AUTO_KFREE(ins_data);
int dst_index;
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
@@ -4396,8 +4806,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src = src_path->nodes[0];
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -4486,7 +4895,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr + extent_num_bytes - 1,
&ordered_sums, false);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
@@ -4496,7 +4905,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
kfree(sums);
}
if (ret)
- goto out;
+ return ret;
add_to_batch:
ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
@@ -4510,11 +4919,11 @@ add_to_batch:
* so we don't need to do anything.
*/
if (batch.nr == 0)
- goto out;
+ return 0;
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret)
- goto out;
+ return ret;
dst_index = 0;
for (int i = 0; i < nr; i++) {
@@ -4566,10 +4975,7 @@ copy_item:
dst_index++;
}
- btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
btrfs_release_path(dst_path);
-out:
- kfree(ins_data);
return ret;
}
@@ -4669,7 +5075,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (extent_map_is_compressed(em)) {
+ if (btrfs_extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = em->disk_num_bytes;
} else {
@@ -4678,7 +5084,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- block_start = extent_map_block_start(em);
+ block_start = btrfs_extent_map_block_start(em);
csum_root = btrfs_csum_root(trans->fs_info, block_start);
ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
block_start + csum_offset + csum_len - 1,
@@ -4688,9 +5094,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
ret = 0;
while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
if (!ret)
ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
@@ -4713,7 +5119,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
enum btrfs_compression_type compress_type;
u64 extent_offset = em->offset;
- u64 block_start = extent_map_block_start(em);
+ u64 block_start = btrfs_extent_map_block_start(em);
u64 block_len;
int ret;
@@ -4724,7 +5130,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = em->disk_num_bytes;
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
@@ -4776,7 +5182,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, &fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(fi));
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -4800,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *dst_path = NULL;
+ BTRFS_PATH_AUTO_FREE(dst_path);
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
@@ -4918,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
- btrfs_free_path(dst_path);
return ret;
}
@@ -4969,7 +5373,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_sort(NULL, &extents, extent_cmp);
process:
while (!list_empty(&extents)) {
- em = list_entry(extents.next, struct extent_map, list);
+ em = list_first_entry(&extents, struct extent_map, list);
list_del_init(&em->list);
@@ -4978,8 +5382,8 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
continue;
}
@@ -4987,8 +5391,8 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
@@ -5010,12 +5414,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -5290,9 +5694,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_inode *inode,
u64 *other_ino, u64 *other_parent)
{
- int ret;
- struct btrfs_path *search_path;
- char *name = NULL;
+ BTRFS_PATH_AUTO_FREE(search_path);
+ char AUTO_KFREE(name);
u32 name_len = 0;
u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
@@ -5301,8 +5704,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
search_path = btrfs_alloc_path();
if (!search_path)
return -ENOMEM;
- search_path->search_commit_root = 1;
- search_path->skip_locking = 1;
+ search_path->search_commit_root = true;
+ search_path->skip_locking = true;
while (cur_offset < item_size) {
u64 parent;
@@ -5335,10 +5738,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
char *new_name;
new_name = krealloc(name, this_name_len, GFP_NOFS);
- if (!new_name) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_name)
+ return -ENOMEM;
name_len = this_name_len;
name = new_name;
}
@@ -5356,29 +5757,24 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
if (di_key.objectid != key->objectid) {
- ret = 1;
*other_ino = di_key.objectid;
*other_parent = parent;
+ return 1;
} else {
- ret = 0;
+ return 0;
}
} else {
- ret = -EAGAIN;
+ return -EAGAIN;
}
- goto out;
} else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
+ return PTR_ERR(di);
}
btrfs_release_path(search_path);
cur_offset += this_len;
}
- ret = 0;
-out:
- btrfs_free_path(search_path);
- kfree(name);
- return ret;
+
+ return 0;
}
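/*
 * Illustrative sketch (assumption, the real btrfs helpers may be defined
 * differently): auto-free locals such as AUTO_KFREE() and
 * BTRFS_PATH_AUTO_FREE() are typically built on the compiler's cleanup
 * attribute, which is what lets the function above drop its "out:" label
 * and the free-on-exit boilerplate.
 */
static inline void example_kfree_cleanup(char **p)
{
	kfree(*p);
}
#define EXAMPLE_AUTO_KFREE(name) \
	char *name __attribute__((cleanup(example_kfree_cleanup))) = NULL

static int example_auto_free_usage(void)
{
	EXAMPLE_AUTO_KFREE(buf);

	buf = kmalloc(64, GFP_NOFS);
	if (!buf)
		return -ENOMEM;
	/* buf is freed automatically on every return path. */
	return 0;
}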
/*
@@ -5426,7 +5822,7 @@ struct btrfs_dir_list {
* See process_dir_items_leaf() for details about why it is needed.
* This is a recursive operation - if an existing dentry corresponds to a
* directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
* the dentries point to we do not acquire their VFS lock, otherwise lockdep
* complains about the following circular lock dependency / possible deadlock:
*
@@ -5485,7 +5881,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5501,7 +5896,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5528,17 +5923,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5580,14 +5974,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5631,8 +6024,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (WARN_ON_ONCE(ret > 0)) {
@@ -5652,8 +6045,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
}
btrfs_release_path(path);
- path->search_commit_root = 0;
- path->skip_locking = 0;
+ path->search_commit_root = false;
+ path->skip_locking = false;
return ret;
}
@@ -5665,7 +6058,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5756,12 +6149,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5797,7 +6190,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5833,9 +6226,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5851,8 +6243,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5863,8 +6255,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -6108,8 +6500,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
if (!first)
return 0;
- ins_data = kmalloc(max_batch_size * sizeof(u32) +
- max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
@@ -6145,7 +6536,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
curr = list_next_entry(curr, log_list);
}
- ASSERT(batch.nr >= 1);
+ ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr);
ret = insert_delayed_items_batch(trans, log, path, &batch, first);
curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
@@ -6189,7 +6580,9 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
}
last_dir_index = curr->index;
- ASSERT(last_dir_index >= first_dir_index);
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
ret = insert_dir_log_key(trans, inode->root->log_root, path,
ino, first_dir_index, last_dir_index);
@@ -6283,7 +6676,9 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
goto next_batch;
last_dir_index = last->index;
- ASSERT(last_dir_index >= first_dir_index);
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
/*
* If this range starts right after where the previous one ends,
* then we want to reuse the previous range item and change its
@@ -6350,12 +6745,13 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
*/
lockdep_assert_not_held(&inode->log_mutex);
- ASSERT(!ctx->logging_new_delayed_dentries);
+ ASSERT(!ctx->logging_new_delayed_dentries,
+ "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
ctx->logging_new_delayed_dentries = true;
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6371,8 +6767,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6380,12 +6776,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6609,6 +7005,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_log_get_delayed_items(inode, &delayed_ins_list,
&delayed_del_list);
+ /*
+ * If we are fsyncing a file with 0 hard links, then commit the delayed
+ * inode, because the last inode ref (or extref) item may still be in the
+ * subvolume tree and if we logged it the file would still exist after a
+ * log replay. Committing the delayed inode deletes that last ref, so we
+ * can skip logging it.
+ */
+ if (inode->vfs_inode.i_nlink == 0) {
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
inode_only, ctx,
@@ -6711,7 +7120,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
- if (inode_only != LOG_INODE_EXISTS)
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
@@ -6750,7 +7159,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
@@ -6758,15 +7167,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -6778,8 +7187,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6792,28 +7201,24 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
- struct btrfs_key inode_key;
- struct inode *dir_inode;
-
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
+ u64 dir_id;
+ struct btrfs_inode *dir_inode;
if (key.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)
(ptr + cur_offset);
- inode_key.objectid = btrfs_inode_extref_parent(
- leaf, extref);
+ dir_id = btrfs_inode_extref_parent(leaf, extref);
cur_offset += sizeof(*extref);
cur_offset += btrfs_inode_extref_name_len(leaf,
extref);
} else {
- inode_key.objectid = key.offset;
+ dir_id = key.offset;
cur_offset = item_size;
}
- dir_inode = btrfs_iget_logging(inode_key.objectid, root);
+ dir_inode = btrfs_iget_logging(dir_id, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6837,32 +7242,25 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode)) {
- ret = PTR_ERR(dir_inode);
- goto out;
- }
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
- goto out;
+ return ret;
}
path->slots[0]++;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -6878,7 +7276,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6893,11 +7291,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -6974,7 +7371,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret;
@@ -6995,7 +7392,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0)
path->slots[0]++;
@@ -7007,8 +7404,8 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -7025,10 +7422,8 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = -EMLINK;
- goto out;
- }
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -7040,14 +7435,11 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
goto again;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -7065,42 +7457,29 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
- bool log_dentries = false;
+ bool log_dentries;
- if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* If we're logging an inode from a subvolume created in the current
* transaction we must force a commit since the root is not persisted.
*/
- if (btrfs_root_generation(&root->root_item) == trans->transid) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
- /*
- * Skip already logged inodes or inodes corresponding to tmpfiles
- * (since logging them is pointless, a link count of 0 means they
- * will never be accessible).
- */
- if ((btrfs_inode_in_log(inode, trans->transid) &&
- list_empty(&ctx->ordered_extents)) ||
- inode->vfs_inode.i_nlink == 0) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ /* Skip inodes that are already logged and have no new extents. */
+ if (btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents))
+ return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_no_trans;
+ return ret;
ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
@@ -7119,8 +7498,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
- log_dentries = true;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
/*
* On unlink we must make sure all our current and old parent directory
@@ -7175,8 +7557,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (log_dentries)
ret = log_new_dir_dentries(trans, inode, ctx);
- else
- ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
@@ -7186,7 +7566,7 @@ end_trans:
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -7220,8 +7600,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
@@ -7241,23 +7619,27 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7272,17 +7654,22 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
btrfs_abort_transaction(trans, ret);
goto error;
}
- wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
- true);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
/*
* We didn't find the subvol, likely because it was
@@ -7295,36 +7682,37 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- btrfs_put_root(log);
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
+ }
- if (!ret)
- goto next;
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto error;
+ goto next;
}
- wc.replay_dest->log_root = log;
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ goto next;
}
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- struct btrfs_root *root = wc.replay_dest;
-
- btrfs_release_path(path);
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.root;
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
@@ -7334,17 +7722,21 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret)
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
}
-
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
- btrfs_put_root(log);
+next:
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
+ }
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
-next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
@@ -7353,7 +7745,7 @@ next:
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -7371,14 +7763,13 @@ next:
if (ret)
return ret;
- log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
@@ -7475,6 +7866,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry about renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
@@ -7511,6 +7904,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
+
/*
* this will force the logging code to walk the dentry chain
* up for the file
@@ -7542,6 +7941,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
ret = 0;
/*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
* If we are doing a rename (old_dir is not NULL) from a directory that
* was previously logged, make sure that on log replay we get the old
* dir entry deleted. This is needed because we will also log the new
@@ -7553,12 +7959,21 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct fscrypt_name fname;
- ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
+ "old_dir_index=%llu", old_dir_index);
ret = fscrypt_setup_filename(&old_dir->vfs_inode,
&old_dentry->d_name, 0, &fname);
if (ret)
goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7572,19 +7987,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* mark the log for a full commit.
*/
if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
fscrypt_free_filename(&fname);
goto out;
}
log_pinned = true;
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- fscrypt_free_filename(&fname);
- goto out;
- }
-
/*
* Another concurrent task might be logging the old directory,
* as it can be triggered when logging other inode that had or
@@ -7616,9 +8025,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, inode);
- ctx.logging_new_name = true;
- btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7627,7 +8033,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
- free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
@@ -7640,5 +8045,6 @@ out:
btrfs_set_log_full_commit(trans);
if (log_pinned)
btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}
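/*
 * Illustrative lifetime sketch, not btrfs code: after the reordering the
 * log context follows a simple init/alloc/use/free pairing, with the
 * scratch extent buffer released once on the single exit path whether or
 * not the log ended up being updated.
 */
static void example_log_ctx_lifetime(struct btrfs_inode *inode)
{
	struct btrfs_log_ctx ctx;

	btrfs_init_log_ctx(&ctx, inode);
	ctx.logging_new_name = true;
	btrfs_init_log_ctx_scratch_eb(&ctx);
	/* ... join the log transaction and log the new name ... */
	free_extent_buffer(ctx.scratch_eb);
}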
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index dc313e6bb2fa..41e47fda036d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -8,8 +8,7 @@
#include <linux/list.h>
#include <linux/fs.h>
-#include "messages.h"
-#include "ctree.h"
+#include <linux/fscrypt.h>
#include "transaction.h"
struct inode;
@@ -80,13 +79,12 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index);
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
- struct btrfs_inode *inode, u64 dirid);
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 1ac2678fc4ca..9e8cb3b7c064 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -27,18 +27,29 @@ struct tree_mod_elem {
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
- /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
- struct tree_mod_root old_root;
+ union {
+ /*
+ * This is used for the following op types:
+ *
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
+ * BTRFS_MOD_LOG_KEY_REMOVE
+ * BTRFS_MOD_LOG_KEY_REPLACE
+ */
+ struct {
+ struct btrfs_disk_key key;
+ u64 blockptr;
+ } slot_change;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+ };
};
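/*
 * Illustrative sketch, not part of the patch: any consumer of a
 * tree_mod_elem has to pick the union member that matches tm->op,
 * mirroring what tree_mod_log_rewind() does further below.
 */
static u64 example_tree_mod_blockptr(const struct tree_mod_elem *tm)
{
	switch (tm->op) {
	case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
	case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
	case BTRFS_MOD_LOG_KEY_REMOVE:
	case BTRFS_MOD_LOG_KEY_REPLACE:
		return tm->slot_change.blockptr;
	default:
		/* Other ops are not slot changes handled by this example. */
		return 0;
	}
}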
/*
@@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
return 0;
}
+static inline bool skip_eb_logging(const struct extent_buffer *eb)
+{
+ const u64 owner = btrfs_header_owner(eb);
+
+ if (btrfs_header_level(eb) == 0)
+ return true;
+
+ /*
+ * Tree mod logging exists so that there's a consistent view of the
+ * extents and backrefs of inodes even if, while a task is iterating over
+ * them, other tasks are modifying subvolume trees and the extent tree
+ * (including running delayed refs). So we only need to log extent
+ * buffers from the extent tree and subvolume trees.
+ */
+
+ if (owner == BTRFS_EXTENT_TREE_OBJECTID)
+ return false;
+
+ if (btrfs_is_fstree(owner))
+ return false;
+
+ return true;
+}
+
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
@@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return true;
write_lock(&fs_info->tree_mod_log_lock);
@@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return false;
return true;
@@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
{
struct tree_mod_elem *tm;
+ /* Can't be one of these types, due to union in struct tree_mod_elem. */
+ ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
+ ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
+
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
tm->logical = eb->start;
- if (op != BTRFS_MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
+ btrfs_node_key(eb, &tm->slot_change.key, slot);
+ tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
@@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
@@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index fc59b57257d6..7e16a253fb35 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist)
kfree(ulist);
}
+static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *val = key;
+ const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node);
+
+ if (unode->val < *val)
+ return 1;
+ else if (unode->val > *val)
+ return -1;
+
+ return 0;
+}
+
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
- struct rb_node *n = ulist->root.rb_node;
- struct ulist_node *u = NULL;
-
- while (n) {
- u = rb_entry(n, struct ulist_node, rb_node);
- if (u->val < val)
- n = n->rb_right;
- else if (u->val > val)
- n = n->rb_left;
- else
- return u;
- }
- return NULL;
+ struct rb_node *node;
+
+ node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp);
+ return rb_entry_safe(node, struct ulist_node, rb_node);
}
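/*
 * Illustrative sketch (assumption): rb_find() and rb_find_add() expect
 * the comparator to return a negative value when the key sorts before
 * the node, positive when it sorts after, and 0 on a match, which is why
 * ulist_node_val_key_cmp() returns 1 when unode->val < *val.
 */
static bool example_ulist_contains(struct ulist *ulist, u64 val)
{
	return rb_find(&val, &ulist->root, ulist_node_val_key_cmp) != NULL;
}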
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
@@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
ulist->nnodes--;
}
+static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node);
+
+ return ulist_node_val_key_cmp(&unode->val, existing);
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
- struct rb_node **p = &ulist->root.rb_node;
- struct rb_node *parent = NULL;
- struct ulist_node *cur = NULL;
-
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct ulist_node, rb_node);
-
- if (cur->val < ins->val)
- p = &(*p)->rb_right;
- else if (cur->val > ins->val)
- p = &(*p)->rb_left;
- else
- return -EEXIST;
- }
- rb_link_node(&ins->rb_node, parent, p);
- rb_insert_color(&ins->rb_node, &ulist->root);
+ struct rb_node *node;
+
+ node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp);
+ if (node)
+ return -EEXIST;
return 0;
}
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index aca2861f2187..e3a1310fa7d5 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -27,32 +27,26 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
u8 type, u64 subid)
{
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
u32 item_size;
unsigned long offset;
struct btrfs_key key;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -ENOENT;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -ENOENT;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
btrfs_uuid_to_key(uuid, type, &key);
ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
eb = path->nodes[0];
slot = path->slots[0];
@@ -64,7 +58,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
btrfs_warn(uuid_root->fs_info,
"uuid item with illegal size %lu!",
(unsigned long)item_size);
- goto out;
+ return ret;
}
while (item_size) {
__le64 data;
@@ -78,8 +72,6 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
item_size -= sizeof(data);
}
-out:
- btrfs_free_path(path);
return ret;
}
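/*
 * Illustrative sketch, not part of the patch: a uuid tree item is just a
 * packed array of little-endian 64-bit subvolume ids, so lookup walks it
 * one __le64 at a time until a match is found or the item is exhausted.
 */
static bool example_uuid_item_contains(struct extent_buffer *eb,
				       unsigned long offset, u32 item_size,
				       u64 subid)
{
	while (item_size >= sizeof(__le64)) {
		__le64 data;

		read_extent_buffer(eb, &data, offset, sizeof(data));
		if (le64_to_cpu(data) == subid)
			return true;
		offset += sizeof(data);
		item_size -= sizeof(data);
	}
	return false;
}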
@@ -89,7 +81,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
int slot;
@@ -100,18 +92,14 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
if (ret != -ENOENT)
return ret;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -EINVAL;
btrfs_uuid_to_key(uuid, type, &key);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
sizeof(subid_le));
@@ -134,17 +122,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
ret, key.objectid, key.offset, type);
- goto out;
+ return ret;
}
- ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
- btrfs_mark_buffer_dirty(trans, eb);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
@@ -153,7 +136,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
int slot;
@@ -163,29 +146,23 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
unsigned long move_src;
unsigned long move_len;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -EINVAL;
btrfs_uuid_to_key(uuid, type, &key);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info, "error %d while searching for uuid item!",
ret);
- goto out;
- }
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
+ return ret;
}
+ if (ret > 0)
+ return -ENOENT;
eb = path->nodes[0];
slot = path->slots[0];
@@ -194,8 +171,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
while (item_size) {
__le64 read_subid;
@@ -207,16 +183,12 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
item_size -= sizeof(read_subid);
}
- if (!item_size) {
- ret = -ENOENT;
- goto out;
- }
+ if (!item_size)
+ return -ENOENT;
item_size = btrfs_item_size(eb, slot);
- if (item_size == sizeof(subid)) {
- ret = btrfs_del_item(trans, uuid_root, path);
- goto out;
- }
+ if (item_size == sizeof(subid))
+ return btrfs_del_item(trans, uuid_root, path);
move_dst = offset;
move_src = offset + sizeof(subid);
@@ -224,9 +196,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
memmove_extent_buffer(eb, move_dst, move_src, move_len);
btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
@@ -295,7 +265,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret = 0;
struct extent_buffer *leaf;
int slot;
@@ -303,10 +273,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
unsigned long offset;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = 0;
@@ -314,17 +282,15 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
again_search_slot:
ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
- if (ret) {
- if (ret > 0)
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
while (1) {
- if (btrfs_fs_closing(fs_info)) {
- ret = -EINTR;
- goto out;
- }
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@@ -355,7 +321,7 @@ again_search_slot:
ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
key.type, subid_cpu);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
btrfs_release_path(path);
ret = btrfs_uuid_iter_rem(root, uuid, key.type,
@@ -371,7 +337,7 @@ again_search_slot:
goto again_search_slot;
}
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
key.offset++;
goto again_search_slot;
}
@@ -388,8 +354,6 @@ skip:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index e97ad824ae16..a2ac3fb68bc8 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -109,7 +109,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int count = 0;
int ret;
@@ -121,10 +121,8 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
while (1) {
/* 1 for the item being dropped */
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
/*
* Walk backwards through all the items until we find one that
@@ -143,7 +141,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
path->slots[0]--;
} else if (ret < 0) {
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -161,17 +159,14 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
if (ret) {
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
count++;
btrfs_release_path(path);
btrfs_end_transaction(trans);
}
- ret = count;
btrfs_end_transaction(trans);
-out:
- btrfs_free_path(path);
- return ret;
+ return count;
}
/*
@@ -217,7 +212,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
const char *src, u64 len)
{
struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -233,10 +228,8 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
while (len > 0) {
/* 1 for the new item being inserted */
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
key.objectid = btrfs_ino(inode);
key.type = key_type;
@@ -267,7 +260,6 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
btrfs_end_transaction(trans);
}
- btrfs_free_path(path);
return ret;
}
@@ -296,7 +288,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
char *dest, u64 len, struct folio *dest_folio)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -404,7 +396,6 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
}
}
out:
- btrfs_free_path(path);
if (!ret)
ret = copied;
return ret;
@@ -485,14 +476,14 @@ static int rollback_verity(struct btrfs_inode *inode)
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = del_orphan(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -552,7 +543,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
goto out;
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
@@ -587,6 +578,9 @@ static int btrfs_begin_enable_verity(struct file *filp)
btrfs_assert_inode_locked(inode);
+ if (IS_ENCRYPTED(&inode->vfs_inode))
+ return -EOPNOTSUPP;
+
if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
return -EBUSY;
@@ -676,11 +670,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
if (ret < 0)
return ret;
- if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0))
return -EUCLEAN;
true_size = btrfs_stack_verity_descriptor_size(&item);
- if (true_size > INT_MAX)
+ if (unlikely(true_size > INT_MAX))
return -EUCLEAN;
if (buf_size == 0)
@@ -742,7 +736,7 @@ again:
}
folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
- 0);
+ 0, NULL);
if (!folio)
return ERR_PTR(-ENOMEM);
@@ -802,6 +796,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
+ .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
+ (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1cccaf9c2b0d..ae1742a35e76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -13,12 +13,11 @@
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
-#include "ctree.h"
#include "disk-io.h"
+#include "extent-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
@@ -48,6 +47,7 @@ struct btrfs_io_geometry {
u64 raid56_full_stripe_start;
int max_errors;
enum btrfs_map_op op;
+ bool use_rst;
};
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
@@ -213,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
u64 flags = bg_flags;
u32 size_bp = size_buf;
- if (!flags) {
- strcpy(bp, "NONE");
+ if (!flags)
return;
- }
#define DESCRIBE_FLAG(flag, desc) \
do { \
@@ -402,8 +400,12 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
- rcu_string_free(device->name);
- extent_io_tree_release(&device->alloc_state);
+ /*
+ * No need to call kfree_rcu() or take the RCU read lock, as nothing is
+ * reading the device name.
+ */
+ kfree(rcu_dereference_raw(device->name));
+ btrfs_extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
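/*
 * Illustrative sketch (assumption): paths that can race with a device
 * rename still need the RCU read lock around the name dereference; the
 * raw dereference in btrfs_free_device() is only safe because nothing
 * else can reach the device during teardown.
 */
static void example_print_device_name(const struct btrfs_device *device)
{
	rcu_read_lock();
	pr_info("btrfs device: %s\n", rcu_dereference(device->name));
	rcu_read_unlock();
}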
@@ -413,9 +415,10 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
+ WARN_ON(fs_devices->holding);
while (!list_empty(&fs_devices->devices)) {
- device = list_entry(fs_devices->devices.next,
- struct btrfs_device, dev_list);
+ device = list_first_entry(&fs_devices->devices,
+ struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
@@ -427,8 +430,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
- fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, fs_list);
+ fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices,
+ fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
@@ -472,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
struct block_device *bdev;
int ret;
- *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
if (IS_ERR(*bdev_file)) {
ret = PTR_ERR(*bdev_file);
@@ -487,15 +490,15 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (holder) {
ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
}
invalidate_bdev(bdev);
- *disk_super = btrfs_read_dev_super(bdev);
+ *disk_super = btrfs_read_disk_super(bdev, 0, false);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
@@ -540,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
continue;
if (devt && devt != device->devt)
continue;
- if (fs_devices->opened) {
+ if (fs_devices->opened || fs_devices->holding) {
if (devt)
ret = -EBUSY;
break;
@@ -656,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!device->name)
return -EINVAL;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
&bdev_file, &disk_super);
if (ret)
return ret;
@@ -673,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
- pr_err(
- "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ btrfs_err(NULL,
+ "invalid seeding and uuid-changed device detected");
goto error_free_page;
}
@@ -700,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (device->devt != device->bdev->bd_dev) {
btrfs_warn(NULL,
"device %s maj:min changed from %d:%d to %d:%d",
- device->name->str, MAJOR(device->devt),
+ rcu_dereference_raw(device->name), MAJOR(device->devt),
MINOR(device->devt), MAJOR(device->bdev->bd_dev),
MINOR(device->bdev->bd_dev));
@@ -719,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return -EINVAL;
}
@@ -732,83 +735,11 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}
-/*
- * We can have very weird soft links passed in.
- * One example is "/proc/self/fd/<fd>", which can be a soft link to
- * a block device.
- *
- * But it's never a good idea to use those weird names.
- * Here we check if the path (not following symlinks) is a good one inside
- * "/dev/".
- */
-static bool is_good_dev_path(const char *dev_path)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- bool is_good = false;
- int ret;
-
- if (!dev_path)
- goto out;
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf)
- goto out;
-
- /*
- * Do not follow soft link, just check if the original path is inside
- * "/dev/".
- */
- ret = kern_path(dev_path, 0, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path))
- goto out;
- if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
- goto out;
- is_good = true;
-out:
- kfree(path_buf);
- path_put(&path);
- return is_good;
-}
-
-static int get_canonical_dev_path(const char *dev_path, char *canonical)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- int ret;
-
- if (!dev_path) {
- ret = -EINVAL;
- goto out;
- }
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- ret = strscpy(canonical, resolved_path, PATH_MAX);
-out:
- kfree(path_buf);
- path_put(&path);
- return ret;
-}
-
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
struct path old = { .mnt = NULL, .dentry = NULL };
struct path new = { .mnt = NULL, .dentry = NULL };
- char *old_path = NULL;
+ char AUTO_KFREE(old_path);
bool is_same = false;
int ret;
@@ -820,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
goto out;
rcu_read_lock();
- ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
rcu_read_unlock();
if (ret < 0)
goto out;
@@ -834,7 +765,6 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
if (path_equal(&old, &new))
is_same = true;
out:
- kfree(old_path);
path_put(&old);
path_put(&new);
return is_same;
@@ -853,11 +783,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
- struct rcu_string *name;
+ const char *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
- int error;
+ int ret;
bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -869,11 +799,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EAGAIN);
}
- error = lookup_bdev(path, &path_devt);
- if (error) {
+ ret = lookup_bdev(path, &path_devt);
+ if (ret) {
btrfs_err(NULL, "failed to lookup block device for path %s: %d",
- path, error);
- return ERR_PTR(error);
+ path, ret);
+ return ERR_PTR(ret);
}
fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
@@ -890,7 +820,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
path, MAJOR(path_devt), MINOR(path_devt),
fs_devices->fsid);
}
@@ -961,6 +891,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
current->comm, task_pid_nr(current));
} else if (!device->name || !is_same_device(device, path)) {
+ const char *old_name;
+
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1014,27 +946,31 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(NULL,
+ btrfs_warn(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(NULL,
+ btrfs_info(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
- name = rcu_string_strdup(path, GFP_NOFS);
+ name = kstrdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
- rcu_string_free(device->name);
+ rcu_read_lock();
+ old_name = rcu_dereference(device->name);
+ rcu_read_unlock();
rcu_assign_pointer(device->name, name);
+ kfree_rcu_mightsleep(old_name);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
@@ -1083,7 +1019,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name)
- dev_path = orig_dev->name->str;
+ dev_path = rcu_dereference_raw(orig_dev->name);
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid, dev_path);
@@ -1141,7 +1077,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
if (device->bdev_file) {
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
device->bdev = NULL;
device->bdev_file = NULL;
fs_devices->open_devices--;
@@ -1188,7 +1124,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1220,7 +1156,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
- extent_io_tree_release(&device->alloc_state);
+ btrfs_extent_io_tree_release(&device->alloc_state);
/*
* Reset the flush error record. We might have a transient flush error
@@ -1268,7 +1204,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
+ if (!fs_devices->opened && !fs_devices->holding) {
list_splice_init(&fs_devices->seed_list, &list);
/*
@@ -1298,6 +1234,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct btrfs_device *tmp_device;
+ s64 __maybe_unused value = 0;
int ret = 0;
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
@@ -1327,7 +1264,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+ fs_devices->read_devid = latest_dev->devid;
+ fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(),
+ &value);
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->collect_fs_stats = true;
+
+ if (value) {
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->rr_min_contig_read = value;
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
+ fs_devices->read_devid = value;
+ }
+#else
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#endif
return 0;
}
@@ -1379,48 +1332,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
put_page(page);
}
-static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr, u64 bytenr_orig)
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
{
- struct btrfs_super_block *disk_super;
+ struct btrfs_super_block *super;
struct page *page;
- void *p;
- pgoff_t index;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_mapping;
+ int ret;
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ERR_PTR(ret);
+ }
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
+ /*
+		 * Drop the cached page of the primary superblock so that later
+		 * reads always go to the device.
+ */
+ invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
- p = page_address(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + offset_in_page(bytenr);
-
- if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(p);
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC ||
+ btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
- if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
+ /*
+	 * Make sure the label is properly NUL terminated. We print it with
+	 * '%s', so an unterminated label would make us read beyond the end of
+	 * the label buffer.
+ */
+ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
+ super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return disk_super;
+ return super;
}
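The guard above only truncates when the whole label buffer is in use. Below is a minimal user-space sketch of the same pattern, with a hypothetical 8-byte label size standing in for BTRFS_LABEL_SIZE:

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 8	/* stand-in for BTRFS_LABEL_SIZE */

int main(void)
{
	char label[LABEL_SIZE];

	/* Fill the label completely, leaving no room for a terminating NUL. */
	memcpy(label, "DATAPOOL", LABEL_SIZE);

	/* Same guard as above: truncate only when the last byte is in use. */
	if (label[0] && label[LABEL_SIZE - 1])
		label[LABEL_SIZE - 1] = 0;

	/* Safe to print with %s now; output is "DATAPOO". */
	printf("label: %s\n", label);
	return 0;
}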
int btrfs_forget_devices(dev_t devt)
@@ -1458,7 +1421,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev && (device->bdev->bd_dev == devt) &&
- strcmp(device->name->str, path) != 0) {
+ strcmp(rcu_dereference_raw(device->name), path) != 0) {
mutex_unlock(&fs_devices->device_list_mutex);
/* Do not skip registration. */
@@ -1484,30 +1447,17 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
* the device or return an error. Multi-device and seeding devices are registered
* in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+struct btrfs_device *btrfs_scan_one_device(const char *path,
bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- char *canonical_path = NULL;
- u64 bytenr;
dev_t devt;
- int ret;
lockdep_assert_held(&uuid_mutex);
- if (!is_good_dev_path(path)) {
- canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (canonical_path) {
- ret = get_canonical_dev_path(path, canonical_path);
- if (ret < 0) {
- kfree(canonical_path);
- canonical_path = NULL;
- }
- }
- }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1518,24 +1468,11 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
- /*
- * We would like to check all the super blocks, but doing so would
- * allow a mount to succeed after a mkfs from a different filesystem.
- * Currently, recovery from a bad primary btrfs superblock is done
- * using the userspace command 'btrfs check --super'.
- */
- ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
- if (ret) {
- device = ERR_PTR(ret);
- goto error_bdev_put;
- }
-
- disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
- btrfs_sb_offset(0));
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -1543,7 +1480,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
devt = file_bdev(bdev_file)->bd_dev;
if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
path, MAJOR(devt), MINOR(devt));
btrfs_free_stale_devices(devt, NULL);
@@ -1552,8 +1489,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(canonical_path ? : path, disk_super,
- &new_device_added);
+ device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1561,8 +1497,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- fput(bdev_file);
- kfree(canonical_path);
+ bdev_fput(bdev_file);
return device;
}
@@ -1578,9 +1513,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
- CHUNK_ALLOCATED, NULL)) {
+ if (btrfs_find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
@@ -1595,6 +1530,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return BTRFS_DEVICE_RANGE_RESERVED;
case BTRFS_CHUNK_ALLOC_ZONED:
@@ -1604,8 +1542,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device)
* for superblock logging.
*/
return 0;
- default:
- BUG();
}
}
@@ -1618,7 +1554,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
int ret;
bool changed = false;
- ASSERT(IS_ALIGNED(*hole_start, zone_size));
+ ASSERT(IS_ALIGNED(*hole_start, zone_size),
+ "hole_start=%llu zone_size=%llu", *hole_start, zone_size);
while (*hole_size > 0) {
pos = btrfs_find_allocatable_zones(device, *hole_start,
@@ -1684,6 +1621,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
/* No extra check */
break;
@@ -1698,8 +1638,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
continue;
}
break;
- default:
- BUG();
}
break;
@@ -1742,7 +1680,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u64 search_start;
u64 hole_size;
u64 max_hole_start;
@@ -1772,12 +1710,12 @@ again:
}
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = device->devid;
- key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = search_start;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
@@ -1869,9 +1807,10 @@ next:
else
ret = 0;
- ASSERT(max_hole_start + max_hole_size <= search_end);
+ ASSERT(max_hole_start + max_hole_size <= search_end,
+ "max_hole_start=%llu max_hole_size=%llu search_end=%llu",
+ max_hole_start, max_hole_size, search_end);
out:
- btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -1885,7 +1824,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
@@ -1896,15 +1835,15 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = device->devid;
- key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1919,7 +1858,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- goto out;
+ return ret;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
@@ -1927,8 +1866,6 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -1956,7 +1893,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
@@ -1968,13 +1905,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
ret = btrfs_previous_item(fs_info->chunk_root, path,
@@ -1987,10 +1923,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -2001,7 +1934,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2020,7 +1953,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
&key, sizeof(*dev_item));
btrfs_trans_release_chunk_metadata(trans);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -2045,12 +1978,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(trans, leaf);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -2062,14 +1991,11 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- int ret;
-
- ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
- if (ret)
- return;
- inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
- path_put(&path);
+ if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) {
+ vfs_utimes(&path, NULL);
+ path_put(&path);
+ }
}
static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
@@ -2077,7 +2003,7 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -2091,16 +2017,12 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
btrfs_trans_release_chunk_metadata(trans);
- if (ret) {
- if (ret > 0)
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
+ if (ret < 0)
+ return ret;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
/*
@@ -2183,7 +2105,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- ASSERT(num_devices > 1);
+ ASSERT(num_devices > 1, "num_devices=%llu", num_devices);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
@@ -2199,7 +2121,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
const u64 bytenr = btrfs_sb_offset(copy_num);
int ret;
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
+ disk_super = btrfs_read_disk_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
return;
@@ -2232,7 +2154,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device->name->str);
+ update_dev_time(rcu_dereference_raw(device->name));
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -2272,7 +2194,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
btrfs_dev_name(device), device->devid);
return -ETXTBSY;
@@ -2303,7 +2225,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
ret = btrfs_rm_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
/* Any error in dev item removal is critical */
btrfs_crit(fs_info,
"failed to remove device item for devid %llu: %d",
@@ -2362,7 +2284,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and fput() on the block device will pull in the
+ * write lock, and bdev_fput() on the block device will pull in the
* ->open_mutex on the block device and it's dependencies. Instead
* just flush the device and let the caller do the final bdev_release.
*/
@@ -2387,7 +2309,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
*/
if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- ASSERT(cur_devices->opened == 1);
+ ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened);
cur_devices->opened--;
free_fs_devices(cur_devices);
}
@@ -2541,7 +2463,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return 0;
}
@@ -2686,7 +2608,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
@@ -2700,15 +2622,15 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = 0;
while (1) {
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
next_slot:
@@ -2717,7 +2639,7 @@ next_slot:
if (ret > 0)
break;
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -2741,19 +2663,14 @@ next_slot:
device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
- if (device->fs_devices->seeding) {
+ if (device->fs_devices->seeding)
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(trans, leaf);
- }
path->slots[0]++;
goto next_slot;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
@@ -2775,7 +2692,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return -EROFS;
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file))
return PTR_ERR(bdev_file);
@@ -2784,6 +2701,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
@@ -2900,21 +2822,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
@@ -2991,7 +2913,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3003,7 +2925,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
@@ -3019,12 +2941,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -3038,10 +2958,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(trans, leaf);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3094,7 +3010,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -3102,28 +3018,26 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
- else if (ret > 0) { /* Logic error or corruption */
+ return ret;
+ if (unlikely(ret > 0)) {
+ /* Logic error or corruption */
btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
chunk_offset);
btrfs_abort_transaction(trans, -ENOENT);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3321,7 +3235,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
- ASSERT(0);
+		DEBUG_WARN("error %ld reading chunk map at offset %llu",
+ PTR_ERR(map), chunk_offset);
return PTR_ERR(map);
}
@@ -3341,7 +3256,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
- if (ret) {
+ if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3353,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3402,8 +3323,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *space_info;
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ space_info = btrfs_find_space_info(fs_info, sys_flags);
+ if (unlikely(!space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ sys_bg = btrfs_create_chunk(trans, space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3411,17 +3340,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3430,7 +3359,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3446,7 +3375,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_trans_release_chunk_metadata(trans);
ret = btrfs_remove_block_group(trans, map);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3461,7 +3390,8 @@ out:
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
@@ -3491,7 +3421,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
btrfs_scrub_continue(fs_info);
if (ret) {
/*
@@ -3544,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
@@ -3560,17 +3490,17 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
- goto error;
+ return ret;
}
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* On the first search we would find chunk tree with
* offset -1, which is not possible. On subsequent
@@ -3578,9 +3508,8 @@ again:
* offset (one less than the previous one, wrong
* alignment and size).
*/
- ret = -EUCLEAN;
mutex_unlock(&fs_info->reclaim_bgs_lock);
- goto error;
+ return -EUCLEAN;
}
ret = btrfs_previous_item(chunk_root, path, key.objectid,
@@ -3588,7 +3517,7 @@ again:
if (ret)
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
- goto error;
+ return ret;
if (ret > 0)
break;
@@ -3601,7 +3530,8 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset,
+ true);
if (ret == -ENOSPC)
failed++;
else
@@ -3621,8 +3551,6 @@ again:
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
-error:
- btrfs_free_path(path);
return ret;
}
@@ -3748,10 +3676,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
-
btrfs_set_balance_flags(leaf, item, bctl->flags);
-
- btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -3866,26 +3791,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
 * Balance filters. Return true if the chunk should be filtered out
* (should not be balanced).
*/
-static int chunk_profiles_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3903,18 +3827,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
- u64 chunk_offset, struct btrfs_balance_args *bargs)
+static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3927,15 +3851,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_devid_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3944,10 +3867,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
@@ -3960,9 +3883,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
}
/* [pstart, pend) */
-static int chunk_drange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3973,7 +3895,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
- return 0;
+ return false;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
@@ -3989,56 +3911,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* [vstart, vend) */
-static int chunk_vrange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_stripes_range_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_soft_convert_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
- return 0;
+ return false;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
- return 1;
+ return true;
- return 0;
+ return false;
}
-static int should_balance_chunk(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 chunk_offset)
+static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4048,7 +3967,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
- return 0;
+ return false;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
@@ -4061,46 +3980,46 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/*
@@ -4108,7 +4027,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
- return 0;
+ return false;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
@@ -4118,12 +4037,12 @@ static int should_balance_chunk(struct extent_buffer *leaf,
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
- return 0;
+ return false;
else
bargs->limit_max--;
}
- return 1;
+ return true;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
@@ -4132,7 +4051,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_root *chunk_root = fs_info->chunk_root;
u64 chunk_type;
struct btrfs_chunk *chunk;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -4170,8 +4089,8 @@ again:
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
@@ -4275,7 +4194,7 @@ again:
}
}
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
@@ -4303,7 +4222,6 @@ loop:
goto again;
}
error:
- btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
@@ -4320,7 +4238,7 @@ error:
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
-static int alloc_profile_is_valid(u64 flags, int extended)
+static int alloc_profile_is_valid(u64 flags, bool extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -4461,7 +4379,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
u32 size_buf = 1024;
char tmp_buf[192] = {'\0'};
- char *buf;
+ char AUTO_KFREE(buf);
char *bp;
u32 size_bp = size_buf;
int ret;
@@ -4509,12 +4427,10 @@ out_overflow:
btrfs_info(fs_info, "balance: %s %s",
(bctl->flags & BTRFS_BALANCE_RESUME) ?
"resume" : "start", buf);
-
- kfree(buf);
}
/*
- * Should be called with balance mutexe held
+ * Should be called with balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
@@ -4711,12 +4627,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
- sb_end_write(fs_info->sb);
return ret;
}
@@ -4738,7 +4654,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED,
+ "exclusive_operation=%d", fs_info->exclusive_operation);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
/*
@@ -4759,7 +4676,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4774,17 +4691,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) { /* ret = -ENOENT; */
- ret = 0;
- goto out;
+ return 0;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
- if (!bctl) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!bctl)
+ return -ENOMEM;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
@@ -4821,8 +4735,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -4987,8 +4899,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
again:
key.objectid = device->devid;
- key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = (u64)-1;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -5042,7 +4954,7 @@ again:
goto done;
}
- ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
@@ -5074,8 +4986,8 @@ again:
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
- clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
- CHUNK_STATE_MASK);
+ btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK, NULL);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
@@ -5091,7 +5003,7 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_trans_release_chunk_metadata(trans);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
@@ -5202,6 +5114,8 @@ struct alloc_chunk_ctl {
u64 stripe_size;
u64 chunk_size;
int ndevs;
+	/* The space_info the block group is going to belong to. */
+ struct btrfs_space_info *space_info;
};
static void init_alloc_chunk_ctl_policy_regular(
@@ -5275,14 +5189,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = 0;
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
case BTRFS_CHUNK_ALLOC_ZONED:
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
break;
- default:
- BUG();
}
}
@@ -5421,7 +5336,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
* It should hold because:
* dev_extent_min == dev_extent_want == zone_size * dev_stripes
*/
- ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min,
+ "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs,
+ devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min);
ctl->stripe_size = zone_size;
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
@@ -5434,7 +5351,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size,
+ "stripe_size=%llu data_stripes=%d max_chunk_size=%llu",
+ ctl->stripe_size, data_stripes, ctl->max_chunk_size);
}
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5467,12 +5386,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
case BTRFS_CHUNK_ALLOC_ZONED:
return decide_stripe_size_zoned(ctl, devices_info);
- default:
- BUG();
}
}
@@ -5482,9 +5402,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5494,10 +5414,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
+ btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5513,33 +5432,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
btrfs_free_chunk_map(map);
}
+static int btrfs_chunk_map_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_chunk_map *new_map =
+ rb_entry(new, struct btrfs_chunk_map, rb_node);
+ const struct btrfs_chunk_map *exist_map =
+ rb_entry(exist, struct btrfs_chunk_map, rb_node);
+
+ if (new_map->start == exist_map->start)
+ return 0;
+ if (new_map->start < exist_map->start)
+ return -1;
+ return 1;
+}
+
EXPORT_FOR_TESTS
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- bool leftmost = true;
+ struct rb_node *exist;
write_lock(&fs_info->mapping_tree_lock);
- p = &fs_info->mapping_tree.rb_root.rb_node;
- while (*p) {
- struct btrfs_chunk_map *entry;
-
- parent = *p;
- entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
-
- if (map->start < entry->start) {
- p = &(*p)->rb_left;
- } else if (map->start > entry->start) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- write_unlock(&fs_info->mapping_tree_lock);
- return -EEXIST;
- }
+ exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree,
+ btrfs_chunk_map_cmp);
+
+ if (exist) {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
}
- rb_link_node(&map->rb_node, parent, p);
- rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
write_unlock(&fs_info->mapping_tree_lock);
@@ -5603,7 +5523,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
+ block_group = btrfs_make_block_group(trans, ctl->space_info, type, start,
+ ctl->chunk_size);
if (IS_ERR(block_group)) {
btrfs_remove_chunk_map(info, map);
return block_group;
@@ -5629,19 +5550,19 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type)
+ struct btrfs_space_info *space_info,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device_info *devices_info = NULL;
+ struct btrfs_device_info AUTO_KFREE(devices_info);
struct alloc_chunk_ctl ctl;
- struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
- ASSERT(0);
+ DEBUG_WARN("invalid alloc profile for type %llu", type);
return ERR_PTR(-EINVAL);
}
@@ -5653,12 +5574,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
ctl.type = type;
+ ctl.space_info = space_info;
init_alloc_chunk_ctl(fs_devices, &ctl);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
@@ -5667,22 +5589,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
+ if (ret < 0)
+ return ERR_PTR(ret);
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
-
- block_group = create_chunk(trans, &ctl, devices_info);
+ if (ret < 0)
+ return ERR_PTR(ret);
-out:
- kfree(devices_info);
- return block_group;
+ return create_chunk(trans, &ctl, devices_info);
}
/*
@@ -5740,7 +5654,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
+ if (unlikely(!chunk)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -5802,7 +5716,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
struct btrfs_block_group *meta_bg;
+ struct btrfs_space_info *meta_space_info;
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
/*
* When adding a new device for sprouting, the seed device is read-only
@@ -5826,12 +5742,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!meta_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!sys_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -5959,9 +5885,79 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+ for (int index = first; index < first + num_stripes; index++) {
+ const struct btrfs_device *device = map->stripes[index].dev;
+
+ if (device->devid == READ_ONCE(device->fs_devices->read_devid))
+ return index;
+ }
+
+ /* If no read-preferred device is set, use the first stripe. */
+ return first;
+}
+
+struct stripe_mirror {
+ u64 devid;
+ int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+ const struct stripe_mirror *s1 = (const struct stripe_mirror *)a;
+ const struct stripe_mirror *s2 = (const struct stripe_mirror *)b;
+
+ if (s1->devid < s2->devid)
+ return -1;
+ if (s1->devid > s2->devid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Select a stripe for reading using the round-robin algorithm.
+ *
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
+ * sectors per device.
+ * 2. Determine the stripe number for the current read by taking the modulus
+ * of the read cycle with the total number of stripes:
+ *
+ * stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ */
+static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+ struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
+ struct btrfs_device *device = map->stripes[first].dev;
+ struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
+ unsigned int read_cycle;
+ unsigned int total_reads;
+ unsigned int min_reads_per_dev;
+
+ total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
+ min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
+ fs_info->sectorsize_bits;
+
+ for (int index = 0, i = first; i < first + num_stripes; i++) {
+ stripes[index].devid = map->stripes[i].dev->devid;
+ stripes[index].num = i;
+ index++;
+ }
+ sort(stripes, num_stripes, sizeof(struct stripe_mirror),
+ btrfs_cmp_devid, NULL);
+
+ read_cycle = total_reads / min_reads_per_dev;
+ return stripes[read_cycle % num_stripes].num;
+}
+#endif
+
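As a concrete illustration of the selection arithmetic documented above, the following self-contained userspace sketch sorts a few mirrors by devid and picks one per read cycle; the device IDs, total read count and rr_min_contig_read value are invented for the example, and the struct only loosely mirrors the kernel's stripe_mirror.

/* Illustrative userspace sketch of the round-robin pick; not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct stripe_mirror {
	unsigned long long devid;
	int num;
};

static int cmp_devid(const void *a, const void *b)
{
	const struct stripe_mirror *s1 = a;
	const struct stripe_mirror *s2 = b;

	if (s1->devid < s2->devid)
		return -1;
	if (s1->devid > s2->devid)
		return 1;
	return 0;
}

int main(void)
{
	/* Hypothetical RAID1C3 chunk: stripe slot -> devid, in on-disk order. */
	struct stripe_mirror stripes[] = { { 3, 0 }, { 1, 1 }, { 2, 2 } };
	const int num_stripes = 3;
	const unsigned int total_reads = 1000;     /* blocks read so far (assumed value) */
	const unsigned int min_reads_per_dev = 64; /* rr_min_contig_read in blocks (assumed value) */
	const unsigned int read_cycle = total_reads / min_reads_per_dev;

	qsort(stripes, num_stripes, sizeof(stripes[0]), cmp_devid);
	/* 1000 / 64 = 15, 15 % 3 = 0, so the stripe with the lowest devid is picked. */
	printf("picked stripe slot %d\n", stripes[read_cycle % num_stripes].num);
	return 0;
}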
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
- int dev_replace_is_ongoing)
+ bool dev_replace_is_ongoing)
{
const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
@@ -5970,8 +5966,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
int tolerance;
struct btrfs_device *srcdev;
- ASSERT((map->type &
- (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
+ ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)),
+ "type=%llu", map->type);
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
@@ -5988,6 +5984,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_READ_POLICY_RR:
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+ break;
+ case BTRFS_READ_POLICY_DEVID:
+ preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
+ break;
+#endif
}
if (dev_replace_is_ongoing &&
@@ -6025,12 +6029,7 @@ struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
{
struct btrfs_io_context *bioc;
- bioc = kzalloc(
- /* The size of btrfs_io_context */
- sizeof(struct btrfs_io_context) +
- /* Plus the variable array for the stripes */
- sizeof(struct btrfs_io_stripe) * (total_stripes),
- GFP_NOFS);
+ bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS);
if (!bioc)
return NULL;
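struct_size() computes sizeof(*bioc) plus the flexible stripes[] array in one overflow-saturating expression; a plain userspace approximation of the same size calculation, with simplified stand-in fields and no overflow handling, is shown below.

/* Userspace approximation of the struct_size() allocation pattern; fields are stand-ins. */
#include <stdio.h>
#include <stdlib.h>

struct io_stripe {
	unsigned long long physical;
};

struct io_context {
	int num_stripes;
	struct io_stripe stripes[];	/* flexible array member */
};

int main(void)
{
	const int total_stripes = 4;
	/* Rough equivalent of kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS). */
	struct io_context *bioc = calloc(1, sizeof(*bioc) +
					    sizeof(bioc->stripes[0]) * total_stripes);

	if (!bioc)
		return 1;
	bioc->num_stripes = total_stripes;
	printf("allocated %zu bytes for %d stripes\n",
	       sizeof(*bioc) + sizeof(bioc->stripes[0]) * total_stripes, total_stripes);
	free(bioc);
	return 0;
}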
@@ -6264,7 +6263,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
}
/* We can only have at most 2 extra nr_stripes (for DUP). */
- ASSERT(nr_extra_stripes <= 2);
+ ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes);
/*
* For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
* replace.
@@ -6275,7 +6274,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
/* Only DUP can have two extra stripes. */
- ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP,
+ "map_type=%llu", bioc->map_type);
/*
* Swap the last stripe stripes and reduce @nr_extra_stripes.
@@ -6302,7 +6302,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
*/
io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(io_geom->stripe_offset < U32_MAX);
+ ASSERT(io_geom->stripe_offset < U32_MAX,
+ "stripe_offset=%llu", io_geom->stripe_offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6320,8 +6321,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
- ASSERT(io_geom->raid56_full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset,
+ "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu",
+ io_geom->raid56_full_stripe_start, full_stripe_len, offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset,
+ "raid56_full_stripe_start=%llu offset=%llu",
+ io_geom->raid56_full_stripe_start, offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
@@ -6346,8 +6351,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
{
dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (io_geom->op == BTRFS_MAP_READ &&
- btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
return btrfs_get_raid_extent_offset(fs_info, logical, length,
map->type,
io_geom->stripe_index, dst);
@@ -6362,7 +6366,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
const struct btrfs_io_stripe *smap,
const struct btrfs_chunk_map *map,
int num_alloc_stripes,
- enum btrfs_map_op op, int mirror_num)
+ struct btrfs_io_geometry *io_geom)
{
if (!smap)
return false;
@@ -6370,10 +6374,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
if (num_alloc_stripes != 1)
return false;
- if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
return false;
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
return false;
return true;
@@ -6488,7 +6492,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
{
int data_stripes = nr_data_stripes(map);
- ASSERT(io_geom->mirror_num <= 1);
+ ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num);
/* Just grab the data stripe directly. */
io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
io_geom->stripe_nr /= data_stripes;
@@ -6556,7 +6560,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int num_copies;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
+ bool dev_replace_is_ongoing = false;
u16 num_alloc_stripes;
u64 max_len;
@@ -6579,6 +6583,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom.raid56_full_stripe_start = (u64)-1;
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
+ io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
if (dev_replace->replace_task != current)
down_read(&dev_replace->rwsem);
@@ -6647,8 +6652,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
- io_geom.mirror_num)) {
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
*mirror_num_ret = io_geom.mirror_num;
@@ -6662,6 +6666,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
bioc->map_type = map->type;
+ bioc->use_rst = io_geom.use_rst;
/*
* For RAID56 full map, we need to make sure the stripes[] follows the
@@ -6750,6 +6755,8 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
+ if (args->devt)
+ return device->devt == args->devt;
if (args->missing) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
!device->bdev)
@@ -6860,7 +6867,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -6881,9 +6888,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
generate_random_uuid(dev->uuid);
if (path) {
- struct rcu_string *name;
+ const char *name;
- name = rcu_string_strdup(path, GFP_KERNEL);
+ name = kstrdup(path, GFP_KERNEL);
if (!name) {
btrfs_free_device(dev);
return ERR_PTR(-ENOMEM);
@@ -7002,16 +7009,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
- /*
- * Only need to verify chunk item if we're reading from sys chunk array,
- * as chunk item in tree block is already verified by tree-checker.
- */
- if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
- ret = btrfs_check_chunk_valid(leaf, chunk, logical);
- if (ret)
- return ret;
- }
-
map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
@@ -7072,6 +7069,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
map->start, map->chunk_len, ret);
+ btrfs_free_chunk_map(map);
}
return ret;
@@ -7117,8 +7115,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
- if (!btrfs_test_opt(fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "failed to find fsid %pU when attempting to open seed devices",
+ fsid);
return ERR_PTR(-ENOENT);
+ }
fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
@@ -7137,7 +7139,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
@@ -7269,16 +7271,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
- struct btrfs_disk_key *disk_key;
- struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
- u32 num_stripes;
u32 array_size;
- u32 len = 0;
u32 cur_offset;
- u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
@@ -7301,10 +7298,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
cur_offset = 0;
while (cur_offset < array_size) {
- disk_key = (struct btrfs_disk_key *)array_ptr;
- len = sizeof(*disk_key);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ struct btrfs_chunk *chunk;
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr;
+ u32 len = sizeof(*disk_key);
+
+ /*
+ * The sys_chunk_array has already been verified at super block
+ * read time. Only do ASSERT()s for basic checks.
+ */
+ ASSERT(cur_offset + len <= array_size);
btrfs_disk_key_to_cpu(&key, disk_key);
@@ -7312,44 +7314,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
- btrfs_err(fs_info,
- "unexpected item type %u in sys_array at offset %u",
- (u32)key.type, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);
chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be present,
- * exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
+ len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk));
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ ASSERT(cur_offset + len <= array_size);
ret = read_one_chunk(&key, sb, chunk);
if (ret)
@@ -7362,13 +7334,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
-
-out_short_read:
- btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
- len, cur_offset);
- clear_extent_buffer_uptodate(sb);
- free_extent_buffer_stale(sb);
- return -EIO;
}
/*
@@ -7440,7 +7405,7 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -7471,7 +7436,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
- * used for freeze procection of a fs (struct super_block.s_writers),
+ * used for freeze protection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
@@ -7479,7 +7444,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* chunk tree, to keep it simple, just skip locking on the chunk tree.
*/
ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
- path->skip_locking = 1;
+ path->skip_locking = true;
/*
* Read all device items, and then all the chunk items. All
@@ -7488,8 +7453,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *node = path->nodes[1];
@@ -7557,8 +7522,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
ret = 0;
error:
mutex_unlock(&uuid_mutex);
-
- btrfs_free_path(path);
return ret;
}
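BTRFS_PATH_AUTO_FREE here (and the AUTO_KFREE use earlier in the series) presumably builds on the compiler cleanup attribute behind the kernel's scope-based __free() helpers, which is why the btrfs_free_path() call and the out-style unwinding can be dropped; a minimal userspace demonstration of that mechanism, with made-up names, follows.

/* Userspace stand-in for scope-based cleanup; names are invented for the example. */
#include <stdio.h>
#include <stdlib.h>

static void free_buf(char **p)
{
	free(*p);
}

#define AUTO_FREE_BUF(name) char *name __attribute__((cleanup(free_buf))) = NULL

int main(void)
{
	AUTO_FREE_BUF(buf);

	buf = malloc(64);
	if (!buf)
		return 1;
	snprintf(buf, 64, "no explicit free or out: label needed");
	printf("%s\n", buf);
	return 0;	/* free_buf() runs automatically when buf goes out of scope */
}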
@@ -7568,8 +7531,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
int ret = 0;
- fs_devices->fs_info = fs_info;
-
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info;
@@ -7660,7 +7621,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int ret = 0;
path = btrfs_alloc_path();
@@ -7682,8 +7643,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
}
out:
mutex_unlock(&fs_devices->device_list_mutex);
-
- btrfs_free_path(path);
return ret;
}
@@ -7692,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
@@ -7708,10 +7667,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, btrfs_dev_name(device));
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -7719,10 +7678,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"delete too small dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -7733,10 +7692,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"insert dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
- goto out;
+ return ret;
}
}
@@ -7745,10 +7704,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(trans, eb);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -7798,7 +7753,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->fs_info,
+ btrfs_err_rl(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7818,7 +7773,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->fs_info,
+ btrfs_info(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7878,7 +7833,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
- ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d", trans->state);
if (list_empty(&trans->dev_update_list))
return;
@@ -7908,8 +7863,6 @@ int btrfs_bg_type_to_factor(u64 flags)
return btrfs_raid_array[index].ncopies;
}
-
-
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
@@ -7923,7 +7876,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
int i;
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
- if (!map) {
+ if (unlikely(!map)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7932,7 +7885,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
stripe_len = btrfs_calc_stripe_length(map);
- if (physical_len != stripe_len) {
+ if (unlikely(physical_len != stripe_len)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, map->start, physical_len,
@@ -7942,7 +7895,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/*
 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
* space. Although kernel can handle it without problem, better to warn
* the users.
*/
@@ -7952,8 +7905,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
devid, physical_offset, physical_len);
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->devid == devid &&
- map->stripes[i].physical == physical_offset) {
+ if (unlikely(map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset)) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
@@ -7966,7 +7919,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
break;
}
}
- if (!found) {
+ if (unlikely(!found)) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
@@ -7975,13 +7928,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!dev) {
+ if (unlikely(!dev)) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- if (physical_offset + physical_len > dev->disk_total_bytes) {
+ if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
@@ -7993,8 +7946,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (dev->zone_info) {
u64 zone_size = dev->zone_info->zone_size;
- if (!IS_ALIGNED(physical_offset, zone_size) ||
- !IS_ALIGNED(physical_len, zone_size)) {
+ if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size))) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
@@ -8018,7 +7971,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
struct btrfs_chunk_map *map;
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
- if (map->num_stripes != map->verified_stripes) {
+ if (unlikely(map->num_stripes != map->verified_stripes)) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
map->start, map->verified_stripes, map->num_stripes);
@@ -8040,7 +7993,7 @@ out:
*/
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
u64 prev_devid = 0;
@@ -8071,17 +8024,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
path->reada = READA_FORWARD;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(ret > 0))
+ return -EUCLEAN;
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
@@ -8103,24 +8054,23 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
- if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
- goto out;
+ return ret;
prev_devid = devid;
prev_dev_ext_end = physical_offset + physical_len;
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
ret = 0;
break;
@@ -8128,10 +8078,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
}
/* Ensure all chunks have corresponding dev extents */
- ret = verify_chunk_dev_extent_mapping(fs_info);
-out:
- btrfs_free_path(path);
- return ret;
+ return verify_chunk_dev_extent_mapping(fs_info);
}
/*
@@ -8168,12 +8115,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
- sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8194,14 +8141,13 @@ static int relocating_repair_kthread(void *data)
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
- ret = btrfs_relocate_chunk(fs_info, target);
+ ret = btrfs_relocate_chunk(fs_info, target, true);
out:
if (cache)
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return ret;
}
@@ -8247,7 +8193,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc,
logical < stripe_start + BTRFS_STRIPE_LEN)
break;
}
- ASSERT(i < data_stripes);
+ ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes);
smap->dev = bioc->stripes[i].dev;
smap->physical = bioc->stripes[i].physical +
((logical - bioc->full_stripe_logical) &
@@ -8276,7 +8222,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
int mirror_ret = mirror_num;
int ret;
- ASSERT(mirror_num > 0);
+ ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
&bioc, smap, &mirror_ret);
@@ -8284,7 +8230,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
return ret;
/* The map range should not cross stripe boundary. */
- ASSERT(map_length >= length);
+ ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length);
/* Already mapped to single stripe. */
if (!bioc)
@@ -8296,7 +8242,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
goto out;
}
- ASSERT(mirror_num <= bioc->num_stripes);
+ ASSERT(mirror_num <= bioc->num_stripes,
+ "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes);
smap->dev = bioc->stripes[mirror_num - 1].dev;
smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3a416b1bc24c..34b854c1a303 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -7,6 +7,7 @@
#define BTRFS_VOLUMES_H
#include <linux/blk_types.h>
+#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/atomic.h>
#include <linux/sort.h>
@@ -18,20 +19,22 @@
#include <linux/completion.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "rcu-string.h"
+#include "extent-io-tree.h"
struct block_device;
struct bdev_handle;
struct btrfs_fs_info;
struct btrfs_block_group;
struct btrfs_trans_handle;
+struct btrfs_transaction;
struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
/*
- * Arbitratry maximum size of one discard request to limit potentially long time
+ * Arbitrary maximum size of one discard request to limit potentially long time
* spent in blkdev_issue_discard().
*/
#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G)
@@ -42,7 +45,7 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN_SHIFT (16)
#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1)
-static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
+static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
/* Used by sanity check for btrfs_raid_types. */
#define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -55,8 +58,7 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
*/
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
- ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
/* ilog2() can handle both constants and variables */
#define BTRFS_BG_FLAG_TO_INDEX(profile) \
@@ -110,7 +112,8 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string __rcu *name;
+ /* Device path or NULL if missing. */
+ const char __rcu *name;
u64 generation;
@@ -296,6 +299,9 @@ enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_ZONED,
};
+#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K)
+/* Keep in sync with raid_attr table, current maximum is RAID1C4. */
+#define BTRFS_RAID1_MAX_MIRRORS (4)
/*
* Read policies for mirrored block group profiles, read picks the stripe based
* on these policies.
@@ -303,6 +309,12 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Balancing RAID1 reads across all striped devices (round-robin). */
+ BTRFS_READ_POLICY_RR,
+ /* Read from a specific device. */
+ BTRFS_READ_POLICY_DEVID,
+#endif
BTRFS_NR_READ_POLICY,
};
@@ -409,6 +421,16 @@ struct btrfs_fs_devices {
/* Count fs-devices opened. */
int opened;
+ /*
+ * Counter of processes that are holding this fs_devices without
+ * having opened it yet.
+ * This is for mount handling: we can only open the fs_devices after
+ * a super block is created, but we cannot take uuid_mutex during
+ * sget_fc(), so we have to hold the fs_devices (meaning it cannot be
+ * released) until a super block is returned.
+ */
+ int holding;
+
/* Set when we find or add a device that doesn't have the nonrot flag set. */
bool rotating;
/* Devices support TRIM/discard commands. */
@@ -417,6 +439,8 @@ struct btrfs_fs_devices {
bool seeding;
/* The mount needs to use a randomly generated fsid. */
bool temp_fsid;
+ /* Enable/disable the filesystem stats tracking. */
+ bool collect_fs_stats;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -431,6 +455,15 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * Minimum number of contiguous reads before switching to the next
+ * device; the unit is one block (sectorsize).
+ */
+ u32 rr_min_contig_read;
+
+ /* Device to be used for reading in case of RAID1. */
+ u64 read_devid;
+
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@@ -449,7 +482,6 @@ struct btrfs_io_stripe {
struct btrfs_device *dev;
/* Block mapping. */
u64 physical;
- u64 length;
bool rst_search_commit_root;
/* For the endio handler. */
struct btrfs_io_context *bioc;
@@ -462,7 +494,7 @@ struct btrfs_discard_stripe {
};
/*
- * Context for IO subsmission for device stripe.
+ * Context for IO submission for device stripe.
*
* - Track the unfinished mirrors for mirror based profiles
* Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
@@ -485,6 +517,7 @@ struct btrfs_io_context {
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
+ bool use_rst;
u64 logical;
u64 size;
@@ -628,6 +661,11 @@ struct btrfs_dev_lookup_args {
u64 devid;
u8 *uuid;
u8 *fsid;
+ /*
+ * If devt is specified, all other members will be ignored as it is
+ * enough to uniquely locate a device.
+ */
+ dev_t devt;
bool missing;
};
@@ -643,7 +681,7 @@ enum btrfs_map_op {
BTRFS_MAP_GET_READ_MIRRORS,
};
-static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+static inline enum btrfs_map_op btrfs_op(const struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -690,12 +728,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type);
+ struct btrfs_space_info *space_info,
+ u64 type);
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
- bool mount_arg_dev);
+struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -729,7 +767,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
@@ -761,6 +800,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -819,7 +860,26 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device)
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
- return rcu_str_deref(device->name);
+ return rcu_dereference(device->name);
+}
+
+static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol)
+{
+ WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol);
+}
+
+static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding >= 0);
+ fs_devices->holding++;
+}
+
+static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding > 0);
+ fs_devices->holding--;
}
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index bc18710d1dcf..ab55d10bd71f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,9 +29,8 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
{
struct btrfs_dir_item *di;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
- int ret = 0;
unsigned long data_ptr;
path = btrfs_alloc_path();
@@ -41,26 +40,19 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
/* lookup the xattr by name */
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
name, strlen(name), 0);
- if (!di) {
- ret = -ENODATA;
- goto out;
- } else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (!di)
+ return -ENODATA;
+ if (IS_ERR(di))
+ return PTR_ERR(di);
leaf = path->nodes[0];
/* if size is 0, that means we want the size of the attr */
- if (!size) {
- ret = btrfs_dir_data_len(leaf, di);
- goto out;
- }
+ if (!size)
+ return btrfs_dir_data_len(leaf, di);
/* now get the data out of our dir_item */
- if (btrfs_dir_data_len(leaf, di) > size) {
- ret = -ERANGE;
- goto out;
- }
+ if (btrfs_dir_data_len(leaf, di) > size)
+ return -ERANGE;
/*
* The way things are packed into the leaf is like this
@@ -73,11 +65,7 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
btrfs_dir_name_len(leaf, di));
read_extent_buffer(leaf, buffer, data_ptr,
btrfs_dir_data_len(leaf, di));
- ret = btrfs_dir_data_len(leaf, di);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_dir_data_len(leaf, di);
}
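The data_ptr arithmetic in btrfs_getxattr() relies on the name bytes sitting immediately after the btrfs_dir_item header and the value bytes following the name; a tiny userspace model of that packing, with simplified field types and sizes, is sketched below.

/* Userspace sketch of the packed layout walked above: header, then name, then data. */
#include <stdio.h>
#include <string.h>

struct dir_item {
	unsigned short name_len;
	unsigned short data_len;
	/* name bytes, then data bytes, follow immediately after the header */
};

int main(void)
{
	union {
		struct dir_item di;
		unsigned char raw[64];
	} leaf;
	struct dir_item *di = &leaf.di;
	const char *name = "user.test";	/* example xattr name */
	const char *data = "value";	/* example xattr value */

	di->name_len = strlen(name);
	di->data_len = strlen(data);
	memcpy((char *)(di + 1), name, di->name_len);
	memcpy((char *)(di + 1) + di->name_len, data, di->data_len);

	/* Same arithmetic as data_ptr = (unsigned long)(di + 1) + name_len above. */
	printf("%.*s\n", di->data_len, (char *)(di + 1) + di->name_len);
	return 0;
}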
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -85,7 +73,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
size_t name_len = strlen(name);
int ret = 0;
@@ -97,7 +85,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_release_on_error = 1;
+ path->skip_release_on_error = true;
if (!value) {
di = btrfs_lookup_xattr(trans, root, path,
@@ -204,7 +192,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
@@ -213,7 +200,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/
}
out:
- btrfs_free_path(path);
if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
@@ -279,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -355,8 +341,6 @@ next:
else
ret = total_size;
- btrfs_free_path(path);
-
return ret;
}
@@ -511,14 +495,15 @@ static int btrfs_initxattrs(struct inode *inode,
*/
nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_KERNEL);
+ const size_t name_len = XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1;
+
+ name = kmalloc(name_len, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name);
if (strcmp(name, XATTR_NAME_CAPS) == 0)
clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
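The scnprintf() call builds the prefixed xattr name with a single bounded, NUL-terminated write instead of two strcpy() calls; a userspace equivalent using snprintf(), with an example suffix, looks like this.

/* Userspace analogue of the prefixed-name construction; suffix is an example. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define XATTR_SECURITY_PREFIX "security."
#define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1)

int main(void)
{
	const char *suffix = "selinux";	/* stands in for xattr->name */
	const size_t name_len = XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1;
	char *name = malloc(name_len);

	if (!name)
		return 1;
	/* One bounded, NUL-terminated write instead of two strcpy() calls. */
	snprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, suffix);
	printf("%s\n", name);	/* prints: security.selinux */
	free(name);
	return 0;
}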
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 8dc4cf49f6f0..0ce10e4ec836 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
+#include <linux/types.h>
+
struct dentry;
struct inode;
struct qstr;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ddf0d5a448a7..6caba8be7c84 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -34,11 +34,9 @@ struct workspace {
int level;
};
-static struct workspace_manager wsm;
-
-struct list_head *zlib_get_workspace(unsigned int level)
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
- struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
+ struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zlib_alloc_workspace(unsigned int level)
+/*
+ * For s390 hardware acceleration, the buffer size should be at least
+ * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance.
+ *
+ * But if bs > ps, the folios can already be large enough to satisfy the s390
+ * hardware requirement.
+ */
+static bool need_special_buffer(struct btrfs_fs_info *fs_info)
+{
+ if (!zlib_deflate_dfltcc_enabled())
+ return false;
+ if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE)
+ return false;
+ return true;
+}
+
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
int workspacesize;
@@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
workspace->level = level;
workspace->buf = NULL;
- /*
- * In case of s390 zlib hardware support, allocate lager workspace
- * buffer. If allocator fails, fall back to a single page buffer.
- */
- if (zlib_deflate_dfltcc_enabled()) {
+ if (need_special_buffer(fs_info)) {
workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN | GFP_NOIO);
workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
}
if (!workspace->buf) {
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- workspace->buf_size = PAGE_SIZE;
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
+ workspace->buf_size = blocksize;
}
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -94,22 +105,64 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+/*
+ * Helper for S390x with hardware zlib compression support.
+ *
+ * The hardware acceleration requires a buffer larger than a single page for
+ * ideal performance, so we copy the data into the workspace buffer rather
+ * than using the page cache folios directly as the input buffer.
+ */
+static int copy_data_into_buffer(struct address_space *mapping,
+ struct workspace *workspace, u64 filepos,
+ unsigned long length)
+{
+ u64 cur = filepos;
+
+ /* It's only for hardware accelerated zlib code. */
+ ASSERT(zlib_deflate_dfltcc_enabled());
+
+ while (cur < filepos + length) {
+ struct folio *folio;
+ void *data_in;
+ unsigned int offset;
+ unsigned long copy_length;
+ int ret;
+
+ ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
+ if (ret < 0)
+ return ret;
+
+ offset = offset_in_folio(folio, cur);
+ copy_length = min(folio_size(folio) - offset,
+ filepos + length - cur);
+
+ data_in = kmap_local_folio(folio, offset);
+ memcpy(workspace->buf + cur - filepos, data_in, copy_length);
+ kunmap_local(data_in);
+ cur += copy_length;
+ }
+ return 0;
+}
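The folio walk in copy_data_into_buffer() is essentially a chunked copy where each iteration is bounded by both the end of the current folio and the end of the requested range; a userspace analogue with a fixed, arbitrary chunk size standing in for folio_size() is shown below.

/* Userspace analogue of the chunked copy loop; CHUNK_SIZE stands in for folio_size(). */
#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE 16	/* arbitrary stand-in for the folio size */

static void copy_range(const char *src, char *dst, size_t filepos, size_t length)
{
	size_t cur = filepos;

	while (cur < filepos + length) {
		size_t offset = cur % CHUNK_SIZE;	/* like offset_in_folio() */
		size_t copy_length = CHUNK_SIZE - offset;

		if (copy_length > filepos + length - cur)
			copy_length = filepos + length - cur;
		memcpy(dst + (cur - filepos), src + cur, copy_length);
		cur += copy_length;
	}
}

int main(void)
{
	char src[64], dst[32] = { 0 };

	for (int i = 0; i < 64; i++)
		src[i] = 'a' + (i % 26);
	copy_range(src, dst, 5, 20);
	printf("%.20s\n", dst);
	return 0;
}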
+
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret;
char *data_in = NULL;
char *cfolio_out;
int nr_folios = 0;
struct folio *in_folio = NULL;
struct folio *out_folio = NULL;
- unsigned long bytes_left;
- unsigned int in_buf_folios;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
- const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const unsigned long max_out = nr_dest_folios << min_folio_shift;
+ const u32 blocksize = fs_info->sectorsize;
const u64 orig_end = start + len;
*out_folios = 0;
@@ -118,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflateInit(&workspace->strm, workspace->level);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zlib compression init failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
@@ -130,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -142,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
workspace->strm.next_out = cfolio_out;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
while (workspace->strm.total_in < len) {
/*
@@ -150,36 +201,23 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
* the workspace buffer if required.
*/
if (workspace->strm.avail_in == 0) {
- bytes_left = len - workspace->strm.total_in;
- in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_folios > 1) {
- int i;
-
- /* S390 hardware acceleration path, not subpage. */
- ASSERT(!btrfs_is_subpage(
- inode_to_fs_info(mapping->host),
- mapping));
- for (i = 0; i < in_buf_folios; i++) {
- if (data_in) {
- kunmap_local(data_in);
- folio_put(in_folio);
- data_in = NULL;
- }
- ret = btrfs_compress_filemap_get_folio(mapping,
- start, &in_folio);
- if (ret < 0)
- goto out;
- data_in = kmap_local_folio(in_folio, 0);
- copy_page(workspace->buf + i * PAGE_SIZE,
- data_in);
- start += PAGE_SIZE;
- workspace->strm.avail_in =
- (in_buf_folios << PAGE_SHIFT);
- }
+ unsigned long bytes_left = len - workspace->strm.total_in;
+ unsigned int copy_length = min(bytes_left, workspace->buf_size);
+
+ /*
+ * For s390 hardware accelerated zlib, when our folios are smaller
+ * than the copy length, fill the buffer so that we can take full
+ * advantage of the hardware acceleration.
+ */
+ if (need_special_buffer(fs_info)) {
+ ret = copy_data_into_buffer(mapping, workspace,
+ start, copy_length);
+ if (ret < 0)
+ goto out;
+ start += copy_length;
workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = copy_length;
} else {
- unsigned int pg_off;
unsigned int cur_len;
if (data_in) {
@@ -191,9 +229,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- data_in = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ data_in = kmap_local_folio(in_folio,
+ offset_in_folio(in_folio, start));
start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
@@ -202,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zlib compression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
start);
@@ -214,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* we're making it bigger, give up */
- if (workspace->strm.total_in > 8192 &&
+ if (workspace->strm.total_in > blocksize * 2 &&
workspace->strm.total_in <
workspace->strm.total_out) {
ret = -E2BIG;
@@ -229,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -237,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
/* we're all done */
@@ -255,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_FINISH);
if (ret == Z_STREAM_END)
break;
- if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) {
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -265,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -273,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
}
@@ -299,20 +335,22 @@ out:
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret = 0, ret2;
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
struct folio **folios_in = cb->compressed_folios;
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
@@ -373,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp, PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, min_folio_size);
}
}
if (unlikely(ret != Z_STREAM_END)) {
@@ -461,8 +499,8 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_zlib_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_zlib_compress = {
+ .min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11ed523e528e..359a98e6de85 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
-#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
@@ -17,6 +16,8 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -36,12 +37,15 @@
#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
-#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
-#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+#define BTRFS_SB_LOG_FIRST_SHIFT ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -89,7 +93,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
sector_t sector;
for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
- ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL,
+ "zones[%d].type=%d", i, zones[i].type);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
}
@@ -162,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror)
{
u64 zone = U64_MAX;
- ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror);
switch (mirror) {
case 0: zone = 0; break;
case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
}
- ASSERT(zone <= U32_MAX);
+ ASSERT(zone <= U32_MAX, "zone=%llu", zone);
return (u32)zone;
}
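To make the shift arithmetic in sb_zone_number() concrete, a small worked example for a hypothetical 256 MiB zone size (zone shift 28) reproduces the zone numbers the switch above yields for the three superblock mirrors.

/* Worked example of the superblock mirror zone numbers for a 256 MiB zone size (assumed). */
#include <stdio.h>

int main(void)
{
	const unsigned long long first_off = 512ULL << 30;	/* BTRFS_SB_LOG_FIRST_OFFSET  */
	const unsigned long long second_off = 4096ULL << 30;	/* BTRFS_SB_LOG_SECOND_OFFSET */
	const int zone_shift = 28;				/* log2(256 MiB) zone size    */

	/* 1 << (SHIFT - zone_shift) is the same as offset >> zone_shift. */
	printf("mirror 0 -> zone 0\n");
	printf("mirror 1 -> zone %llu\n", first_off >> zone_shift);	/* 2048  */
	printf("mirror 2 -> zone %llu\n", second_off >> zone_shift);	/* 16384 */
	return 0;
}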
@@ -236,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
unsigned int i;
u32 zno;
- ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size),
+ "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size);
zno = pos >> zinfo->zone_size_shift;
/*
* We cannot report zones beyond the zone end. So, it is OK to
@@ -260,17 +266,17 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
}
}
- ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
- copy_zone_info_cb, zones);
+ ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+ *nr_zones, copy_zone_info_cb, zones);
if (ret < 0) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
- pos, rcu_str_deref(device->name),
+ pos, rcu_dereference(device->name),
device->devid);
return ret;
}
*nr_zones = ret;
- if (!ret)
+ if (unlikely(!ret))
return -EIO;
/* Populate cache */
@@ -311,7 +317,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
/* No dev extents at all? Not good */
- if (ret > 0)
+ if (unlikely(ret > 0))
return -EUCLEAN;
}
@@ -395,16 +401,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
/* We reject devices with a zone size larger than 8GB */
if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu larger than supported maximum %llu",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
@@ -416,11 +422,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_str_deref(device->name), max_active_zones,
+ rcu_dereference(device->name), max_active_zones,
BTRFS_MIN_ACTIVE_ZONES);
ret = -EINVAL;
goto out;
@@ -460,9 +469,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_cache = vcalloc(zone_info->nr_zones,
sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to allocate zone cache for %s",
- rcu_str_deref(device->name));
+ rcu_dereference(device->name));
ret = -ENOMEM;
goto out;
}
@@ -487,6 +496,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
__set_bit(nreported, zone_info->active_zones);
nactive++;
break;
@@ -496,20 +506,25 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
}
- if (nreported != zone_info->nr_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nreported != zone_info->nr_zones)) {
+ btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
- rcu_str_deref(device->name), nreported,
+ rcu_dereference(device->name), nreported,
zone_info->nr_zones);
ret = -EIO;
goto out;
}
if (max_active_zones) {
- if (nactive > max_active_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nactive > max_active_zones)) {
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
+ btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_str_deref(device->name),
+ nactive, rcu_dereference(device->name),
max_active_zones);
ret = -EIO;
goto out;
@@ -519,6 +534,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -537,8 +553,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (ret)
goto out;
- if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
+ btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -555,8 +571,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
- if (ret != -ENOENT && ret) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(ret != -ENOENT && ret)) {
+ btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -575,9 +591,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
emulated = "emulated ";
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"%s block device %s, %u %szones of %llu bytes",
- model, rcu_str_deref(device->name), zone_info->nr_zones,
+ model, rcu_dereference(device->name), zone_info->nr_zones,
emulated, zone_info->zone_size);
return 0;
@@ -748,8 +764,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
- if (fs_info->max_zone_append_size < fs_info->max_extent_size)
- fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
+ fs_info->max_zone_append_size);
/*
* Check mount options here, because we might change fs_info->zoned
@@ -882,12 +899,12 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
- BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
- zones);
+ ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES,
+ copy_zone_info_cb, zones);
if (ret < 0)
return ret;
- if (ret != BTRFS_NR_SB_LOG_ZONES)
+ if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
return -EIO;
return sb_log_location(bdev, zones, rw, bytenr_ret);
@@ -988,7 +1005,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
}
/* All the zones are FULL. Should not reach here. */
- ASSERT(0);
+ DEBUG_WARN("unexpected state, all zones full");
return -EIO;
}
@@ -1041,8 +1058,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
bool have_sb;
int i;
- ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size),
+ "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size),
+ "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size);
while (pos < hole_end) {
begin = pos >> shift;
@@ -1158,8 +1177,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
u64 pos;
int ret;
- ASSERT(IS_ALIGNED(start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size),
+ "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size),
+ "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size);
if (begin + nbits > zinfo->nr_zones)
return -ERANGE;
@@ -1181,10 +1202,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
continue;
/* Free regions should be empty */
- btrfs_warn_in_rcu(
+ btrfs_warn(
device->fs_info,
"zoned: resetting device %s (devid %llu) zone %llu for allocation",
- rcu_str_deref(device->name), device->devid, pos >> shift);
+ rcu_dereference(device->name), device->devid, pos >> shift);
WARN_ON_ONCE(1);
ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
@@ -1239,7 +1260,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
- if (!ret)
+ if (unlikely(!ret))
ret = -EUCLEAN;
if (ret < 0)
return ret;
@@ -1260,8 +1281,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
else
length = fs_info->nodesize;
- if (!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length)) {
+ if (unlikely(!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length))) {
return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
@@ -1276,7 +1297,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1303,9 +1324,12 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
+ info->capacity = device->zone_info->zone_size;
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1318,6 +1342,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1329,10 +1365,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
+ if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
+ btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
device->devid);
up_read(&dev_replace->rwsem);
return -EIO;
@@ -1343,10 +1379,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
(info->physical >> device->zone_info->zone_size_shift),
- rcu_str_deref(device->name), device->devid);
+ rcu_dereference(device->name), device->devid);
info->alloc_offset = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
@@ -1371,7 +1407,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
struct zone_info *info,
unsigned long *active)
{
- if (info->alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
info->physical);
@@ -1388,7 +1424,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1399,26 +1436,33 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
- if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
- if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
return -EIO;
}
- if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
+ if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else if (test_bit(0, active)) {
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
@@ -1431,7 +1475,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1446,20 +1491,22 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
- if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
+ if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED))) {
btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in %s profile",
btrfs_bg_type_to_raid_name(map->type));
return -EIO;
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_test_opt(fs_info, DEGRADED) &&
- !btrfs_zone_activate(bg)) {
+ if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg))) {
return -EIO;
}
} else {
@@ -1479,9 +1526,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1489,13 +1539,30 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
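
A minimal user-space sketch (not kernel code) of the WP_CONVENTIONAL arithmetic above, i.e. how a block group's logical allocation progress is split into per-stripe write pointer offsets for RAID0. It assumes a 64KiB BTRFS_STRIPE_LEN and two stripes; the constants and names are redefined locally for illustration only:

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN		(64 * 1024ULL)
#define STRIPE_LEN_SHIFT	16
#define STRIPE_LEN_MASK		(STRIPE_LEN - 1)

int main(void)
{
	const uint64_t last_alloc = 160 * 1024;			/* bytes allocated in the block group */
	const unsigned int num_stripes = 2;			/* RAID0 over two devices */
	uint64_t stripe_nr = last_alloc >> STRIPE_LEN_SHIFT;	/* 2 full 64KiB stripes */
	const uint64_t stripe_offset = last_alloc & STRIPE_LEN_MASK;	/* 32KiB into the next stripe */
	const unsigned int stripe_index = stripe_nr % num_stripes;	/* device holding the partial stripe */

	stripe_nr /= num_stripes;				/* full stripe rows written to every device */

	for (unsigned int i = 0; i < num_stripes; i++) {
		uint64_t alloc_offset = stripe_nr * STRIPE_LEN;

		if (stripe_index > i)
			alloc_offset += STRIPE_LEN;		/* device already got one extra full stripe */
		else if (stripe_index == i)
			alloc_offset += stripe_offset;		/* device holds the partial stripe */

		/* Prints 98304 (96KiB) for stripe 0 and 65536 (64KiB) for stripe 1; the sum equals last_alloc. */
		printf("stripe %u: alloc_offset=%llu\n", i, (unsigned long long)alloc_offset);
	}
	return 0;
}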
@@ -1511,9 +1578,12 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1521,19 +1591,35 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
@@ -1549,7 +1635,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
- struct zone_info *zone_info = NULL;
+ struct zone_info *zone_info __free(kfree) = NULL;
int ret;
int i;
unsigned long *active = NULL;
@@ -1561,7 +1647,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return 0;
/* Sanity check */
- if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
btrfs_err(fs_info,
"zoned: block group %llu len %llu unaligned to zone size %llu",
logical, length, fs_info->zone_size);
@@ -1587,7 +1673,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1601,8 +1687,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
- /* Zone capacity is always zone size in emulation */
- cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@@ -1611,6 +1695,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
+ cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
@@ -1622,18 +1707,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
@@ -1658,7 +1747,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
- ret = 0;
}
out:
@@ -1668,10 +1756,10 @@ out:
!fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
btrfs_bg_type_to_raid_name(map->type));
- return -EINVAL;
+ ret = -EINVAL;
}
- if (cache->alloc_offset > cache->zone_capacity) {
+ if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
cache->alloc_offset, cache->zone_capacity,
@@ -1701,7 +1789,6 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(zone_info);
return ret;
}
@@ -1728,14 +1815,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
bool ret = false;
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(inode))
+ if (!is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1783,12 +1870,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
ordered->disk_bytenr = logical;
write_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, ordered->file_offset,
- ordered->num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
/* The em should be a new COW extent, thus it should not have an offset. */
- ASSERT(em->offset == 0);
+ ASSERT(em->offset == 0, "em->offset=%llu", em->offset);
em->disk_bytenr = logical;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1798,8 +1885,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(ordered->inode, ordered->file_offset,
- ordered->num_bytes, len, logical))
+ btrfs_split_extent_map(ordered->inode, ordered->file_offset,
+ ordered->num_bytes, len, logical))
return false;
new = btrfs_split_ordered_extent(ordered, len);
@@ -2002,7 +2089,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc, NULL, NULL);
- if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
ret = -EIO;
goto out_put_bioc;
}
@@ -2060,7 +2147,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
if (physical_pos == wp)
return 0;
- if (physical_pos > wp)
+ if (unlikely(physical_pos > wp))
return -EUCLEAN;
length = wp - physical_pos;
@@ -2096,10 +2183,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2110,6 +2202,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2154,27 +2249,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
- struct radix_tree_iter iter;
struct extent_buffer *eb;
- void __rcu **slot;
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
- block_group->start >> fs_info->sectorsize_bits) {
- eb = radix_tree_deref_slot(slot);
- if (!eb)
- continue;
- if (radix_tree_deref_retry(eb)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
+ xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
if (eb->start < block_group->start)
continue;
if (eb->start >= end)
break;
-
- slot = radix_tree_iter_resume(slot, &iter);
rcu_read_unlock();
wait_on_extent_buffer_writeback(eb);
rcu_read_lock();
@@ -2182,6 +2265,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2266,28 +2383,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2324,6 +2425,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!btrfs_is_zoned(fs_info))
return true;
+ if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
+ return false;
+
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&fs_info->zone_active_bgs_lock);
@@ -2362,16 +2466,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
return ret;
}
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
- return;
+ return 0;
block_group = btrfs_lookup_block_group(fs_info, logical);
- ASSERT(block_group);
+ if (WARN_ON_ONCE(!block_group))
+ return -ENOENT;
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2388,16 +2493,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
out:
btrfs_put_block_group(block_group);
+ return 0;
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
+ int ret;
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
- btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ ret = do_zone_finish(bg, true);
+ if (ret)
+ btrfs_handle_fs_error(bg->fs_info, ret,
+ "Failed to finish block-group's zone");
btrfs_put_block_group(bg);
}
@@ -2416,10 +2526,10 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
/* For the work */
btrfs_get_block_group(bg);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2432,6 +2542,106 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->relocation_bg_lock);
}
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_space_info *space_info = data_sinfo;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *bg;
+ struct list_head *bg_list;
+ u64 alloc_flags;
+ bool first = true;
+ bool did_chunk_alloc = false;
+ int index;
+ int ret;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ if (sb_rdonly(fs_info->sb))
+ return;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ index = btrfs_bg_flags_to_raid_index(alloc_flags);
+
+ /* Scan the data space_info to find empty block groups. Take the second one. */
+again:
+ bg_list = &space_info->block_groups[index];
+ list_for_each_entry(bg, bg_list, list) {
+ if (bg->alloc_offset != 0)
+ continue;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* The list cannot go empty here, as we take only the second empty block group. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
+ /* No allocation has ever happened in this block group. */
+ ASSERT(bg->used == 0, "bg->used=%llu", bg->used);
+ /* No super block in a block group on the zoned setup. */
+ ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
+ fs_info->data_reloc_bg = bg->start;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
+ btrfs_zone_activate(bg);
+
+ return;
+ }
+
+ if (did_chunk_alloc)
+ return;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "space_info->subgroup_id=%d", space_info->subgroup_id);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
+ did_chunk_alloc = true;
+ goto again;
+ }
+}
+
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2454,8 +2664,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2468,7 +2678,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2522,7 +2731,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
@@ -2549,10 +2758,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
return ret < 0 ? ret : 1;
}
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool do_finish)
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_group *bg;
int index;
@@ -2651,3 +2859,128 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
+
+/*
+ * Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
+ *
+ * @space_info: the space_info to work on
+ * @num_bytes: target number of bytes to reclaim
+ *
+ * This function resets the zones of a block group, so we can reuse the region
+ * without removing the block group. In contrast, btrfs_delete_unused_bgs()
+ * removes the block group and frees up the underlying zones, so a new block
+ * group has to be allocated before the zones can be reused.
+ *
+ * Resetting is faster than deleting and re-creating a block group. It is
+ * similar to freeing logical space in the regular (non-zoned) mode, with the
+ * limitation that the block group's profile cannot be changed by this operation.
+ */
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ while (num_bytes > 0) {
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg = NULL;
+ bool found = false;
+ u64 reclaimed = 0;
+
+ /*
+ * Here, we choose a fully zone_unusable block group. It's
+ * technically possible to reset a partly zone_unusable block
+ * group, which still has some free space left. However,
+ * handling that would have to coordinate with the allocation side,
+ * which makes the logic more complex. So, let's handle the easy case
+ * for now.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
+ if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
+ continue;
+
+ /*
+ * Use trylock to avoid locking order violation. In
+ * btrfs_reclaim_bgs_work(), the lock order is
+ * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
+ * block group if we cannot take its lock.
+ */
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+ found = true;
+ break;
+ }
+ if (!found) {
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return 0;
+ }
+
+ list_del_init(&bg->bg_list);
+ btrfs_put_block_group(bg);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * Since the block group is fully zone_unusable and we cannot
+ * allocate from this block group anymore, we don't need to set
+ * this block group read-only.
+ */
+
+ down_read(&fs_info->dev_replace.rwsem);
+ map = bg->physical_map;
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ unsigned int nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
+ stripe->physical >> SECTOR_SHIFT,
+ zone_size_sectors);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret) {
+ up_read(&fs_info->dev_replace.rwsem);
+ return ret;
+ }
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ ASSERT(!btrfs_is_block_group_used(bg));
+ if (bg->ro) {
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ continue;
+ }
+
+ reclaimed = bg->alloc_offset;
+ bg->zone_unusable = bg->length - bg->zone_capacity;
+ bg->alloc_offset = 0;
+ /*
+ * This holds because we currently only reset block groups that were
+ * fully used and then freed.
+ */
+ ASSERT(reclaimed == bg->zone_capacity,
+ "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity);
+ bg->free_space_ctl->free_space += reclaimed;
+ space_info->bytes_zone_unusable -= reclaimed;
+ spin_unlock(&bg->lock);
+ btrfs_return_free_space(space_info, reclaimed);
+ spin_unlock(&space_info->lock);
+
+ if (num_bytes <= reclaimed)
+ break;
+ num_bytes -= reclaimed;
+ }
+
+ return 0;
+}
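
A worked example (illustrative numbers, not taken from the patch) of the per block group accounting done in btrfs_reset_unused_block_groups() above:

/*
 *   length        = 256 MiB   (block group covers one zone)
 *   zone_capacity = 192 MiB   (usable part of the zone)
 *   alloc_offset  = 192 MiB   (written up to capacity, then everything freed)
 *
 * After the REQ_OP_ZONE_RESET on every stripe:
 *
 *   reclaimed                        = alloc_offset           = 192 MiB
 *   bg->zone_unusable                = length - zone_capacity =  64 MiB
 *   bg->alloc_offset                 = 0
 *   free_space_ctl->free_space      += 192 MiB
 *   space_info->bytes_zone_unusable -= 192 MiB
 *
 * The loop moves on to the next unused block group only while num_bytes is
 * still larger than the amount reclaimed so far.
 */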
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 7612e6572605..5cefdeb08b7b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -15,7 +15,6 @@
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
-#include "fs.h"
struct block_device;
struct extent_buffer;
@@ -83,19 +82,20 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, bool do_finish);
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -232,14 +232,19 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
return true;
}
-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length) { }
+static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return 0;
+}
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb) { }
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { }
+
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -255,8 +260,7 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
return 1;
}
-static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info,
bool do_finish)
{
/* Consider all the block groups are active */
@@ -265,6 +269,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
+static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ return 0;
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5232b56d5892..c9cddcfa337b 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -24,13 +24,14 @@
#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MIN_LEVEL -15
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(int level,
size_t src_len)
{
zstd_parameters params = zstd_get_params(level, src_len);
@@ -45,13 +46,14 @@ struct workspace {
void *mem;
size_t size;
char *buf;
- unsigned int level;
- unsigned int req_level;
+ int level;
+ int req_level;
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
zstd_in_buffer in_buf;
zstd_out_buffer out_buf;
+ zstd_parameters params;
};
/*
@@ -75,7 +77,6 @@ struct workspace {
*/
struct zstd_workspace_manager {
- const struct btrfs_compress_op *ops;
spinlock_t lock;
struct list_head lru_list;
struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
@@ -84,8 +85,6 @@ struct zstd_workspace_manager {
struct timer_list timer;
};
-static struct zstd_workspace_manager wsm;
-
static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
static inline struct workspace *list_to_workspace(struct list_head *list)
@@ -93,8 +92,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+static inline int clip_level(int level)
+{
+ return max(0, level - 1);
+}
/*
* Timer callback to free unused workspaces.
@@ -108,22 +109,22 @@ struct list_head *zstd_alloc_workspace(unsigned int level);
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
+ struct zstd_workspace_manager *zwsm =
+ container_of(timer, struct zstd_workspace_manager, timer);
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- ASSERT(timer == &wsm.timer);
-
- spin_lock(&wsm.lock);
+ spin_lock(&zwsm->lock);
- if (list_empty(&wsm.lru_list)) {
- spin_unlock(&wsm.lock);
+ if (list_empty(&zwsm->lru_list)) {
+ spin_unlock(&zwsm->lock);
return;
}
- list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ list_for_each_prev_safe(pos, next, &zwsm->lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
- unsigned int level;
+ int level;
if (time_after(victim->last_used, reclaim_threshold))
break;
@@ -137,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level - 1]))
- clear_bit(level - 1, &wsm.active_map);
+ if (list_empty(&zwsm->idle_ws[level]))
+ clear_bit(level, &zwsm->active_map);
}
- if (!list_empty(&wsm.lru_list))
- mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ if (!list_empty(&zwsm->lru_list))
+ mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock(&wsm.lock);
+ spin_unlock(&zwsm->lock);
}
/*
@@ -160,9 +161,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
static void zstd_calc_ws_mem_sizes(void)
{
size_t max_size = 0;
- unsigned int level;
+ int level;
- for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ if (level == 0)
+ continue;
zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
@@ -171,54 +174,61 @@ static void zstd_calc_ws_mem_sizes(void)
zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
- zstd_ws_mem_sizes[level - 1] = max_size;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ zstd_ws_mem_sizes[clip_level(level)] = max_size;
}
}
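
For reference, a small sketch (derived from the constants and clip_level() above, not new behaviour) of how requested zstd levels map onto workspace slots now that negative fast levels are accepted:

/*
 *   requested level   clip_level()   slot used in idle_ws[] / active_map
 *   -15 .. -1         0              0  (all fast levels share the level 1 sized workspace)
 *     1               0              0
 *     3               2              2  (ZSTD_BTRFS_DEFAULT_LEVEL)
 *    15              14             14  (ZSTD_BTRFS_MAX_LEVEL)
 *
 * so both arrays still need only ZSTD_BTRFS_MAX_LEVEL entries.
 */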
-void zstd_init_workspace_manager(void)
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm;
struct list_head *ws;
- int i;
+ ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL);
+ zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL);
+ if (!zwsm)
+ return -ENOMEM;
zstd_calc_ws_mem_sizes();
+ spin_lock_init(&zwsm->lock);
+ init_waitqueue_head(&zwsm->wait);
+ timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0);
- wsm.ops = &btrfs_zstd_compress;
- spin_lock_init(&wsm.lock);
- init_waitqueue_head(&wsm.wait);
- timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
-
- INIT_LIST_HEAD(&wsm.lru_list);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&wsm.idle_ws[i]);
+ INIT_LIST_HEAD(&zwsm->lru_list);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&zwsm->idle_ws[i]);
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm;
- ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
- pr_warn(
- "BTRFS: cannot preallocate zstd compression workspace\n");
+ btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
- set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
- list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map);
+ list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
}
+ return 0;
}
-void zstd_cleanup_workspace_manager(void)
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace;
- int i;
- spin_lock_bh(&wsm.lock);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&wsm.idle_ws[i])) {
- workspace = container_of(wsm.idle_ws[i].next,
+ if (!zwsm)
+ return;
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL;
+ spin_lock_bh(&zwsm->lock);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&zwsm->idle_ws[i])) {
+ workspace = container_of(zwsm->idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
zstd_free_workspace(&workspace->list);
}
}
- spin_unlock_bh(&wsm.lock);
-
- del_timer_sync(&wsm.timer);
+ spin_unlock_bh(&zwsm->lock);
+ timer_delete_sync(&zwsm->timer);
+ kfree(zwsm);
}
/*
@@ -233,29 +243,31 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(unsigned int level)
+static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
struct workspace *workspace;
- int i = level - 1;
+ int i = clip_level(level);
- spin_lock_bh(&wsm.lock);
- for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
- if (!list_empty(&wsm.idle_ws[i])) {
- ws = wsm.idle_ws[i].next;
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
+ for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&zwsm->idle_ws[i])) {
+ ws = zwsm->idle_ws[i].next;
workspace = list_to_workspace(ws);
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
- if (level == workspace->level)
+ if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
- if (list_empty(&wsm.idle_ws[i]))
- clear_bit(i, &wsm.active_map);
- spin_unlock_bh(&wsm.lock);
+ if (list_empty(&zwsm->idle_ws[i]))
+ clear_bit(i, &zwsm->active_map);
+ spin_unlock_bh(&zwsm->lock);
return ws;
}
}
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
return NULL;
}
@@ -270,30 +282,33 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
unsigned int nofs_flag;
+ ASSERT(zwsm);
+
/* level == 0 means we can use any workspace */
if (!level)
level = 1;
again:
- ws = zstd_find_workspace(level);
+ ws = zstd_find_workspace(fs_info, level);
if (ws)
return ws;
nofs_flag = memalloc_nofs_save();
- ws = zstd_alloc_workspace(level);
+ ws = zstd_alloc_workspace(fs_info, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
DEFINE_WAIT(wait);
- prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
- finish_wait(&wsm.wait, &wait);
+ finish_wait(&zwsm->wait, &wait);
goto again;
}
@@ -312,34 +327,36 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace = list_to_workspace(ws);
- spin_lock_bh(&wsm.lock);
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
/* A node is only taken off the lru if we are the corresponding level */
- if (workspace->req_level == workspace->level) {
+ if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
- if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
} else {
workspace->last_used = jiffies;
- list_add(&workspace->lru_list, &wsm.lru_list);
- if (!timer_pending(&wsm.timer))
- mod_timer(&wsm.timer,
+ list_add(&workspace->lru_list, &zwsm->lru_list);
+ if (!timer_pending(&zwsm->timer))
+ mod_timer(&zwsm->timer,
jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
}
}
- set_bit(workspace->level - 1, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ set_bit(workspace->level, &zwsm->active_map);
+ list_add(&workspace->list, &zwsm->idle_ws[workspace->level]);
workspace->req_level = 0;
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
- if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
- cond_wake_up(&wsm.wait);
+ if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
+ cond_wake_up(&zwsm->wait);
}
void zstd_free_workspace(struct list_head *ws)
@@ -351,20 +368,22 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = zstd_ws_mem_sizes[level - 1];
- workspace->level = level;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ workspace->size = zstd_ws_mem_sizes[clip_level(level)];
+ workspace->level = clip_level(level);
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
@@ -377,11 +396,13 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
zstd_cstream *stream;
int ret = 0;
int nr_folios = 0;
@@ -392,23 +413,21 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
- unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- unsigned int pg_off;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ unsigned long max_out = nr_dest_folios * min_folio_size;
unsigned int cur_len;
- zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
- len);
+ workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
*out_folios = 0;
*total_out = 0;
*total_in = 0;
/* Initialize the stream */
- stream = zstd_init_cstream(&params, len, workspace->mem,
+ stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression init level %d failed, root %llu inode %llu offset %llu",
workspace->req_level, btrfs_root_id(inode->root),
btrfs_ino(inode), start);
@@ -420,14 +439,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -435,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
while (1) {
size_t ret2;
@@ -443,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -455,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* Check to see if we are making it bigger */
- if (tot_in + workspace->in_buf.pos > 8192 &&
+ if (tot_in + workspace->in_buf.pos > blocksize * 2 &&
tot_in + workspace->in_buf.pos <
tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
@@ -471,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more output space */
if (workspace->out_buf.pos == workspace->out_buf.size) {
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -485,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out,
- PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
/* We've reached the end of the input */
@@ -506,9 +521,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio,
+ offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
}
@@ -518,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_end_stream(stream, &workspace->out_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -538,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -552,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
if (tot_out >= tot_in) {
@@ -574,13 +587,16 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
+ const u32 blocksize = fs_info->sectorsize;
+ const unsigned int min_folio_size = btrfs_min_folio_size(fs_info);
unsigned long folio_in_index = 0;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -598,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
+ workspace->out_buf.size = blocksize;
while (1) {
size_t ret2;
@@ -638,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
folio_in_index++;
- if (folio_in_index >= total_folios_in) {
+ if (unlikely(folio_in_index >= total_folios_in)) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
- srclen -= PAGE_SIZE;
+ srclen -= min_folio_size;
workspace->in_buf.src =
kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
}
}
ret = 0;
@@ -714,9 +730,8 @@ finish:
return ret;
}
-const struct btrfs_compress_op btrfs_zstd_compress = {
- /* ZSTD uses own workspace manager */
- .workspace_manager = NULL,
+const struct btrfs_compress_levels btrfs_zstd_compress = {
+ .min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};