summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-22 10:51:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-22 10:51:40 -0700
commit5191290407668028179f2544a11ae9b57f0bcf07 (patch)
tree1296aea3a45e412bf5155a9ddf8484097eb9538e
parent9b03992f0c88baef524842e411fbdc147780dd5d (diff)
parentd3e29967079c522ce1c5cab0e9fab2c280b977eb (diff)
Merge tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "This contains feature updates, performance improvements, preparatory and core work and some related VFS updates: Features: - encoded read/write ioctls, allows user space to read or write raw data directly to extents (now compressed, encrypted in the future), will be used by send/receive v2 where it saves processing time - zoned mode now works with metadata DUP (the mkfs.btrfs default) - error message header updates: - print error state: transaction abort, other error, log tree errors - print transient filesystem state: remount, device replace, ignored checksum verifications - tree-checker: verify the transaction id of the to-be-written dirty extent buffer Performance improvements for fsync: - directory logging speedups (up to -90% run time) - avoid logging all directory changes during renames (up to -60% run time) - avoid inode logging during rename and link when possible (up to -60% run time) - prepare extents to be logged before locking a log tree path (throughput +7%) - stop copying old file extents when doing a full fsync() - improved logging of old extents after truncate Core, fixes: - improved stale device identification by dev_t and not just path (for devices that are behind other layers like device mapper) - continued extent tree v2 preparatory work - disable features that won't work yet - add wrappers and abstractions for new tree roots - improved error handling - add super block write annotations around background block group reclaim - fix device scanning messages potentially accessing stale pointer - cleanups and refactoring VFS: - allow reflinks/deduplication from two different mounts of the same filesystem - export and add helpers for read/write range verification, for the encoded ioctls" * tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (98 commits) btrfs: zoned: put block group after final usage btrfs: don't access possibly stale fs_info data in device_list_add btrfs: add lockdep_assert_held to need_preemptive_reclaim btrfs: verify the tranisd of the to-be-written dirty extent buffer btrfs: unify the error handling of btrfs_read_buffer() btrfs: unify the error handling pattern for read_tree_block() btrfs: factor out do_free_extent_accounting helper btrfs: remove last_ref from the extent freeing code btrfs: add a alloc_reserved_extent helper btrfs: remove BUG_ON(ret) in alloc_reserved_tree_block btrfs: add and use helper for unlinking inode during log replay btrfs: extend locking to all space_info members accesses btrfs: zoned: mark relocation as writing fs: allow cross-vfsmount reflink/dedupe btrfs: remove the cross file system checks from remap btrfs: pass btrfs_fs_info to btrfs_recover_relocation btrfs: pass btrfs_fs_info for deleting snapshots and cleaner btrfs: add filesystems state details to error messages btrfs: deal with unexpected extent type during reflinking btrfs: fix unexpected error path when reflinking an inline extent ...
-rw-r--r--fs/btrfs/backref.c7
-rw-r--r--fs/btrfs/block-group.c36
-rw-r--r--fs/btrfs/block-group.h1
-rw-r--r--fs/btrfs/btrfs_inode.h42
-rw-r--r--fs/btrfs/compression.c63
-rw-r--r--fs/btrfs/compression.h10
-rw-r--r--fs/btrfs/ctree.c108
-rw-r--r--fs/btrfs/ctree.h83
-rw-r--r--fs/btrfs/delalloc-space.c18
-rw-r--r--fs/btrfs/dev-replace.c18
-rw-r--r--fs/btrfs/disk-io.c219
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c148
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c76
-rw-r--r--fs/btrfs/file.c79
-rw-r--r--fs/btrfs/free-space-tree.c2
-rw-r--r--fs/btrfs/inode.c1183
-rw-r--r--fs/btrfs/ioctl.c309
-rw-r--r--fs/btrfs/lzo.c11
-rw-r--r--fs/btrfs/ordered-data.c132
-rw-r--r--fs/btrfs/ordered-data.h25
-rw-r--r--fs/btrfs/print-tree.c5
-rw-r--r--fs/btrfs/qgroup.c72
-rw-r--r--fs/btrfs/reflink.c43
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/send.c11
-rw-r--r--fs/btrfs/send.h2
-rw-r--r--fs/btrfs/space-info.c5
-rw-r--r--fs/btrfs/super.c96
-rw-r--r--fs/btrfs/sysfs.c15
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c19
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-checker.c35
-rw-r--r--fs/btrfs/tree-log.c982
-rw-r--r--fs/btrfs/tree-log.h7
-rw-r--r--fs/btrfs/volumes.c147
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/btrfs/zoned.c167
-rw-r--r--fs/internal.h5
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/read_write.c34
-rw-r--r--fs/remap_range.c7
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/trace/events/btrfs.h1
-rw-r--r--include/uapi/linux/btrfs.h133
-rw-r--r--include/uapi/linux/btrfs_tree.h3
50 files changed, 3109 insertions, 1331 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c9ee579bc5a6..ebc392ea1d74 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
}
+
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
@@ -1335,7 +1337,8 @@ again:
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 8202ad6aa131..c22d287e020b 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1522,8 +1522,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
return;
+ }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1531,6 +1535,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return;
}
@@ -1605,6 +1610,7 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -2006,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
set_free_space_tree_thresholds(cache);
@@ -2288,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2444,6 +2451,27 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size)
@@ -2464,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
@@ -2693,7 +2723,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 5878b7ce3b78..93aabc68bb6a 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -68,6 +68,7 @@ struct btrfs_block_group {
u64 bytes_super;
u64 flags;
u64 cache_generation;
+ u64 global_root_id;
/*
* If the free space extent count exceeds this number, convert the block
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b3e46aabc3d8..47e72d72f7d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -14,6 +14,13 @@
#include "delayed-inode.h"
/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -173,8 +180,9 @@ struct btrfs_inode {
u64 disk_i_size;
/*
- * if this is a directory then index_cnt is the counter for the index
- * number for new files that are created
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
*/
u64 index_cnt;
@@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
spin_unlock(&inode->lock);
}
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the later, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 71e5b2e9a1ba..be476f094300 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
bi_size += bvec->bv_len;
if (bio->bi_status)
- cb->errors = 1;
+ cb->status = bio->bi_status;
ASSERT(bi_size && bi_size <= cb->compressed_len);
last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
@@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
return last_io;
}
-static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
@@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi
}
/* Do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
+ if (cb->status != BLK_STS_OK) {
+ cb->orig_bio->bi_status = cb->status;
+ bio_endio(cb->orig_bio);
} else {
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- ASSERT(bio);
- ASSERT(!bio->bi_status);
/*
* We have verified the checksum already, set page checked so
* the end_io handlers know about it
*/
- ASSERT(!bio_flagged(bio, BIO_CLONED));
+ ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
u64 bvec_start = page_offset(bvec->bv_page) +
bvec->bv_offset;
@@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio)
* Some IO in this cb have failed, just skip checksum as there
* is no way it could be correct.
*/
- if (cb->errors == 1)
+ if (cb->status != BLK_STS_OK)
goto csum_failed;
inode = cb->inode;
@@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
- cb->errors = 1;
- finish_compressed_bio_read(cb, bio);
+ cb->status = errno_to_blk_status(ret);
+ finish_compressed_bio_read(cb);
out:
bio_put(bio);
}
@@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode,
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
+ const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
- if (cb->errors)
- mapping_set_error(inode->i_mapping, -EIO);
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
continue;
}
for (i = 0; i < ret; i++) {
- if (cb->errors)
+ if (errno)
SetPageError(pages[i]);
btrfs_page_clamp_clear_writeback(fs_info, pages[i],
cb->start, cb->len);
@@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
*/
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- !cb->errors);
+ cb->status == BLK_STS_OK);
- end_compressed_writeback(inode, cb);
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
/* Note, our inode could be gone now */
/*
@@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css)
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
@@ -524,13 +526,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!cb)
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
+ cb->writeback = writeback;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
@@ -591,7 +594,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret)
goto finish_cb;
}
@@ -808,7 +811,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- blk_status_t ret = BLK_STS_RESOURCE;
+ blk_status_t ret;
int faili = 0;
u8 *sums;
@@ -821,17 +824,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
- if (!em)
- return BLK_STS_IOERR;
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
- if (!cb)
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
goto out;
+ }
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = cb->sums;
@@ -851,8 +858,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
- if (!cb->compressed_pages)
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
goto fail1;
+ }
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
@@ -938,7 +947,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio = NULL;
}
}
- return 0;
+ return BLK_STS_OK;
fail2:
while (faili >= 0) {
@@ -951,6 +960,8 @@ fail1:
kfree(cb);
out:
free_extent_map(em);
+ bio->bi_status = ret;
+ bio_endio(bio);
return ret;
finish_cb:
if (comp_bio) {
@@ -970,7 +981,7 @@ finish_cb:
*/
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb, NULL);
+ finish_compressed_bio_read(cb);
return ret;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 56eef0821e3e..ac5b20731d2a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -22,6 +22,8 @@ struct btrfs_inode;
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
@@ -52,8 +54,11 @@ struct compressed_bio {
/* The compression algorithm for this bio */
u8 compress_type;
+ /* Whether this is a write for writeback. */
+ bool writeback;
+
/* IO errors */
- u8 errors;
+ blk_status_t status;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -95,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css);
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a7db3f6f1b7b..0eecf98d0abb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -846,9 +846,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -1436,13 +1438,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ *eb_ret = tmp;
+ return 0;
}
/*
@@ -1460,19 +1462,19 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
- } else {
- ret = PTR_ERR(tmp);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
}
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+ free_extent_buffer(tmp);
btrfs_release_path(p);
return ret;
@@ -2990,16 +2992,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
@@ -3224,7 +3221,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
path->nodes[1], slot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -3235,12 +3231,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
goto out;
@@ -4170,24 +4160,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4227,24 +4215,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ebb2d109e8bb..4db17bd05a21 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -148,6 +149,8 @@ enum {
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -274,8 +277,14 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
+ /* Extent tree v2 */
+ __le64 block_group_root;
+ __le64 block_group_root_generation;
+ u8 block_group_root_level;
+
/* future expansion */
- __le64 reserved[28];
+ u8 reserved8[7];
+ __le64 reserved[25];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -300,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -314,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -636,6 +666,7 @@ struct btrfs_fs_info {
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1030,6 +1061,8 @@ struct btrfs_fs_info {
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ u64 nr_global_roots;
+
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
@@ -1609,25 +1642,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
type *s, u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
@@ -1658,8 +1691,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
@@ -1667,8 +1700,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
@@ -2328,6 +2361,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
+/*
+ * For extent tree v2 we overload the extent root with the block group root, as
+ * we will have multiple extent roots.
+ */
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
+ struct btrfs_root_backup, extent_root_level, 8);
+
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2462,6 +2506,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
+ block_group_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
+ struct btrfs_super_block,
+ block_group_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
+ block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2839,7 +2890,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -3155,7 +3207,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig);
+ u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -3256,6 +3308,11 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3318,6 +3375,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -3774,7 +3833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index fb46a28f5065..bd8267c4687d 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
- u64 num_bytes, u64 *meta_reserve,
- u64 *qgroup_reserve)
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
{
u64 nr_extents = count_max_extents(num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
@@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
@@ -329,8 +331,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
- &qgroup_reserve);
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
return ret;
@@ -349,7 +351,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -454,7 +456,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 62b9651ea662..71fd99b48283 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
@@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
@@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
+ device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
*device_out = device;
return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b3e9cf3fd1dd..62565ee00b97 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
else
ret = btrfs_check_leaf_full(eb);
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation, the eb reached here must be newer than
+ * last committed. Or something seriously wrong happened.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
}
/* Checksum all dirty extent buffers in one bio_vec */
@@ -1289,12 +1303,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
return root;
}
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_key key = {
.objectid = BTRFS_CSUM_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
- .offset = 0,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
};
return btrfs_global_root(fs_info, &key);
@@ -1305,7 +1340,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
struct btrfs_key key = {
.objectid = BTRFS_EXTENT_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
- .offset = 0,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
};
return btrfs_global_root(fs_info, &key);
@@ -1522,7 +1557,8 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = PTR_ERR(root->node);
root->node = NULL;
goto fail;
- } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
goto fail;
}
@@ -1727,6 +1763,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->uuid_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1925,8 +1962,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_root *root = arg;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
int again;
while (1) {
@@ -1959,7 +1995,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(fs_info);
- again = btrfs_clean_one_deleted_snapshot(root);
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
/*
@@ -2095,8 +2131,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
{
const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
- struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
- struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -2121,11 +2155,30 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, extent_root->node->start);
- btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(extent_root->node));
- btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(extent_root->node));
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
+ btrfs_set_backup_block_group_root(root_backup,
+ info->block_group_root->node->start);
+ btrfs_set_backup_block_group_root_gen(root_backup,
+ btrfs_header_generation(info->block_group_root->node));
+ btrfs_set_backup_block_group_root_level(root_backup,
+ btrfs_header_level(info->block_group_root->node));
+ } else {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
/*
* we might commit during log recovery, which happens before we set
@@ -2146,12 +2199,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
- btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(csum_root->node));
- btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(csum_root->node));
-
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
@@ -2269,6 +2316,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->uuid_root);
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2504,11 +2552,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = NULL;
btrfs_put_root(log_tree_root);
return ret;
- } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
}
+
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
@@ -2533,6 +2583,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
{
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_root *root;
+ u64 max_global_id = 0;
int ret;
struct btrfs_key key = {
.objectid = objectid,
@@ -2568,6 +2619,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
break;
btrfs_release_path(path);
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
@@ -2585,6 +2643,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
}
btrfs_release_path(path);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
if (!found || ret) {
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
@@ -2930,6 +2991,56 @@ out:
return ret;
}
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ bytenr = btrfs_super_block_group_root(sb);
+ gen = btrfs_super_block_group_root_generation(sb);
+ level = btrfs_super_block_group_root_level(sb);
+ ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
+ if (ret)
+ btrfs_warn(fs_info, "couldn't read block group root");
+ return ret;
+}
+
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
@@ -2939,10 +3050,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
- u64 generation;
- int level;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
+ GFP_KERNEL);
+ if (!root)
+ return -ENOMEM;
+ fs_info->block_group_root = root;
+ }
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
@@ -2967,29 +3085,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
}
- generation = btrfs_super_generation(sb);
- level = btrfs_super_root_level(sb);
- tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
- BTRFS_ROOT_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(tree_root->node)) {
- handle_error = true;
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- btrfs_warn(fs_info, "couldn't read tree root");
- continue;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
+ ret = load_important_roots(fs_info);
+ if (ret) {
handle_error = true;
- ret = -EIO;
- btrfs_warn(fs_info, "error while reading tree root");
continue;
}
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
@@ -3009,8 +3111,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
}
/* All successful */
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -3293,7 +3395,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
up_read(&fs_info->cleanup_work_sem);
mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(fs_info->tree_root);
+ ret = btrfs_recover_relocation(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
@@ -3594,21 +3696,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
-
- chunk_root->node = read_tree_block(fs_info,
- btrfs_super_chunk_root(disk_super),
- BTRFS_CHUNK_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(chunk_root->node) ||
- !extent_buffer_uptodate(chunk_root->node)) {
+ ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
+ generation, level);
+ if (ret) {
btrfs_err(fs_info, "failed to read chunk root");
- if (!IS_ERR(chunk_root->node))
- free_extent_buffer(chunk_root->node);
- chunk_root->node = NULL;
goto fail_tree_roots;
}
- btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
- chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
offsetof(struct btrfs_header, chunk_tree_uuid),
@@ -3728,7 +3821,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
goto fail_sysfs;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5e8bef4b7563..2e10514ecda8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -111,6 +111,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96427b1ecac3..f477035a2ac2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -598,7 +598,7 @@ fail:
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop, int *last_ref)
+ int refs_to_drop)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
- *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1072,8 +1071,7 @@ static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op,
- int *last_ref)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_item *ei;
@@ -1121,7 +1119,6 @@ void update_inline_extent_backref(struct btrfs_path *path,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
- *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1166,8 +1163,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
}
return -EUCLEAN;
}
- update_inline_extent_backref(path, iref, refs_to_add,
- extent_op, NULL);
+ update_inline_extent_backref(path, iref, refs_to_add, extent_op);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
@@ -1181,21 +1177,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data, int *last_ref)
+ int refs_to_drop, int is_data)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
- if (iref) {
- update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
- last_ref);
- } else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
- last_ref);
- } else {
- *last_ref = 1;
+ if (iref)
+ update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ else if (is_data)
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ else
ret = btrfs_del_item(trans, root, path);
- }
return ret;
}
@@ -2766,12 +2758,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
- u64 to_add = len;
-
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- to_add = min(len, global_rsv->size -
- global_rsv->reserved);
+ u64 to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
@@ -2862,6 +2853,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool is_data)
+{
+ int ret;
+
+ if (is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(trans->fs_info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
/*
* Drop one or more refs of @node.
*
@@ -2943,7 +2963,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
- int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
extent_root = btrfs_extent_root(info, bytenr);
@@ -3010,8 +3029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
- NULL, refs_to_drop, is_data,
- &last_ref);
+ NULL, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3136,8 +3154,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
- iref, refs_to_drop, is_data,
- &last_ref);
+ iref, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3182,7 +3199,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -3191,28 +3207,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- if (is_data) {
- struct btrfs_root *csum_root;
- csum_root = btrfs_csum_root(info, bytenr);
- ret = btrfs_del_csums(trans, csum_root, bytenr,
- num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- }
-
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
}
btrfs_release_path(path);
@@ -4605,6 +4600,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
return ret;
}
+static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
+ if (ret) {
+ ASSERT(!ret);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ bytenr, num_bytes);
+ return ret;
+ }
+
+ trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
+ return 0;
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
@@ -4665,18 +4682,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
- BUG();
- }
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
- return ret;
+ return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
@@ -4694,7 +4700,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes;
u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -4704,12 +4709,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
extent_key.offset = ref->level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
- num_bytes = fs_info->nodesize;
} else {
extent_key.offset = node->num_bytes;
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
- num_bytes = node->num_bytes;
}
path = btrfs_alloc_path();
@@ -4754,22 +4757,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, extent_key.objectid,
- num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, true);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- extent_key.objectid, extent_key.offset);
- BUG();
- }
-
- trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
- fs_info->nodesize);
- return ret;
+ return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5923eec8caa8..df7c81255964 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2610,6 +2610,7 @@ static bool btrfs_check_repairable(struct inode *inode,
* a good copy of the failed sector and if we succeed, we have setup
* everything for repair_io_failure to do the rest for us.
*/
+ ASSERT(failed_mirror);
failrec->failed_mirror = failed_mirror;
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
@@ -2639,7 +2640,6 @@ int btrfs_repair_one_sector(struct inode *inode,
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
struct btrfs_bio *repair_bbio;
- blk_status_t status;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
@@ -2678,13 +2678,13 @@ int btrfs_repair_one_sector(struct inode *inode,
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
- status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
- failrec->bio_flags);
- if (status) {
- free_io_failure(failure_tree, tree, failrec);
- bio_put(repair_bio);
- }
- return blk_status_to_errno(status);
+ /*
+ * At this point we have a bio, so any errors from submit_bio_hook()
+ * will be handled by the endio on the repair_bio, so we can't return an
+ * error here.
+ */
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ return BLK_STS_OK;
}
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
@@ -3068,6 +3068,14 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
/*
+ * If we failed to submit the IO at all we'll have a
+ * mirror_num == 0, in which case we need to just mark
+ * the page with an error and unlock it and carry on.
+ */
+ if (mirror == 0)
+ goto readpage_ok;
+
+ /*
* btrfs_submit_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
@@ -3534,7 +3542,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR_OR_NULL(em)) {
+ if (em_cached && !IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -3563,7 +3571,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 cur_end;
struct extent_map *em;
int ret = 0;
- int nr = 0;
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
@@ -3608,9 +3615,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
unlock_extent(tree, cur, end);
end_page_read(page, false, cur, end + 1 - cur);
+ ret = PTR_ERR(em);
break;
}
extent_offset = cur - em->start;
@@ -3721,9 +3729,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
end_bio_extent_readpage, 0,
this_bio_flag,
force_bio_submit);
- if (!ret) {
- nr++;
- } else {
+ if (ret) {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
goto out;
@@ -3951,7 +3957,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
@@ -4780,11 +4786,12 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return ret;
}
if (cache) {
- /* Impiles write in zoned mode */
- btrfs_put_block_group(cache);
- /* Mark the last eb in a block group */
+ /*
+ * Implies write in zoned mode. Mark the last eb in a block group.
+ */
if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ btrfs_put_block_group(cache);
}
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
@@ -5390,7 +5397,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
break;
len = ALIGN(len, sectorsize);
em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR_OR_NULL(em))
+ if (IS_ERR(em))
return em;
/* if this isn't a hole return it */
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index c28ceddefae4..6fee14ce2e6b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -492,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
@@ -506,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *new,
int modified)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90c5c38836ab..c828f971a346 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -305,7 +305,7 @@ found:
read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
ret * csum_size);
out:
- if (ret == -ENOENT)
+ if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
return ret;
}
@@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_bio *bbio = NULL;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
@@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
u8 *csum;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
+ blk_status_t ret = BLK_STS_OK;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
@@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
@@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
search_len, csum_dst);
- if (count <= 0) {
- /*
- * Either we hit a critical error or we didn't find
- * the csum.
- * Either way, we put zero into the csums dst, and skip
- * to the next sector.
- */
+ if (count < 0) {
+ ret = errno_to_blk_status(count);
+ if (bbio)
+ btrfs_bio_free_csum(bbio);
+ break;
+ }
+
+ /*
+ * We didn't find a csum for this range. We need to make sure
+ * we complain loudly about this, because we are not NODATASUM.
+ *
+ * However for the DATA_RELOC inode we could potentially be
+ * relocating data extents for a NODATASUM inode, so the inode
+ * itself won't be marked with NODATASUM, but the extent we're
+ * copying is in fact NODATASUM. If we don't find a csum we
+ * assume this is the case.
+ */
+ if (count == 0) {
memset(csum_dst, 0, csum_size);
count = 1;
- /*
- * For data reloc inode, we need to mark the range
- * NODATASUM so that balance won't report false csum
- * error.
- */
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset;
@@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
btrfs_free_path(path);
- return BLK_STS_OK;
+ return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -612,32 +620,33 @@ fail:
return ret;
}
-/*
- * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+/**
+ * Calculate checksums of the data contained inside a bio
+ *
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
- * @file_start: offset in file this bio begins to describe
- * @contig: Boolean. If true/1 means all bio vecs in this bio are
- * contiguous and they begin at @file_start in the file. False/0
- * means this bio can contain potentially discontiguous bio vecs
- * so the logical offset of each should be calculated separately.
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
+ * file offsets are determined from the page offsets in the bio.
+ * Otherwise, this is the starting file offset of the bio vecs in
+ * @bio, which must be contiguous.
+ * @one_ordered: If true, @bio only refers to one ordered extent.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig)
+ u64 offset, bool one_ordered)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
+ const bool use_page_offsets = (offset == (u64)-1);
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
- int nr_sectors;
+ unsigned int blockcount;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
int i;
- u64 offset;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -651,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
- if (contig)
- offset = file_start;
- else
- offset = 0; /* shut up gcc */
-
sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (!contig)
+ if (use_page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
@@ -681,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
}
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
+ blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
- for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->num_bytes ||
- offset < ordered->file_offset) {
+ for (i = 0; i < blockcount; i++) {
+ if (!one_ordered &&
+ !in_range(offset, ordered->file_offset,
+ ordered->num_bytes)) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
@@ -1211,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
extent_start = key.offset;
extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a0179cc62913..9f455c96c974 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -691,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int modify_tree = -1;
int update_refs;
int found = 0;
- int leafs_visited = 0;
struct btrfs_path *path = args->path;
args->bytes_found = 0;
@@ -729,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path->slots[0]--;
}
ret = 0;
- leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -741,7 +739,6 @@ next_slot:
ret = 0;
break;
}
- leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
@@ -987,7 +984,7 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && args->replace_extent && leafs_visited == 1 &&
+ if (!ret && args->replace_extent &&
path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + args->extent_item_size) {
@@ -1722,7 +1719,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes,
+ reserve_bytes);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -2039,12 +2037,43 @@ out:
return err < 0 ? err : written;
}
-static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
- struct iov_iter *from)
+static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t count;
+ ssize_t ret;
+
+ btrfs_inode_lock(inode, 0);
+ count = encoded->len;
+ ret = generic_write_checks_count(iocb, &count);
+ if (ret == 0 && count != encoded->len) {
+ /*
+ * The write got truncated by generic_write_checks_count(). We
+ * can't do a partial encoded write.
+ */
+ ret = -EFBIG;
+ }
+ if (ret || encoded->len == 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, from, encoded->len);
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_do_encoded_write(iocb, from, encoded);
+out:
+ btrfs_inode_unlock(inode, 0);
+ return ret;
+}
+
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
{
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
- ssize_t num_written = 0;
+ ssize_t num_written, num_sync;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
/*
@@ -2055,22 +2084,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EOPNOTSUPP;
if (sync)
atomic_inc(&inode->sync_writers);
- if (iocb->ki_flags & IOCB_DIRECT)
- num_written = btrfs_direct_write(iocb, from);
- else
- num_written = btrfs_buffered_write(iocb, from);
+ if (encoded) {
+ num_written = btrfs_encoded_write(iocb, from, encoded);
+ num_sync = encoded->len;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = num_sync = btrfs_direct_write(iocb, from);
+ } else {
+ num_written = num_sync = btrfs_buffered_write(iocb, from);
+ }
btrfs_set_inode_last_sub_trans(inode);
- if (num_written > 0)
- num_written = generic_write_sync(iocb, num_written);
+ if (num_sync > 0) {
+ num_sync = generic_write_sync(iocb, num_sync);
+ if (num_sync < 0)
+ num_written = num_sync;
+ }
if (sync)
atomic_dec(&inode->sync_writers);
@@ -2079,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
return num_written;
}
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return btrfs_do_write_iter(iocb, from, NULL);
+}
+
int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
@@ -2474,7 +2514,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
hole_em->len = end - offset;
@@ -2495,8 +2535,7 @@ out:
} while (ret == -EEXIST);
free_extent_map(hole_em);
if (ret)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
}
return 0;
@@ -2850,7 +2889,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
if (ret)
goto out_trans;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 655aad0f9e1c..0ae54d8c10d6 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root(
.offset = 0,
};
+ if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
+ key.offset = block_group->global_root_id;
return btrfs_global_root(block_group->fs_info, &key);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5bbea5ec31fc..2e7143ff5523 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,11 @@ struct btrfs_dio_data {
struct extent_changeset *data_reserved;
};
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, bool extent_inserted,
- struct btrfs_root *root, struct inode *inode,
- u64 start, size_t size, size_t compressed_size,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
+ struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
- unsigned long offset;
+ u64 i_size;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
@@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
size_t datasize;
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = start;
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->i_mapping,
- start >> PAGE_SHIFT);
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = offset_in_page(start);
- write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
}
@@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
- size = ALIGN(size, root->fs_info->sectorsize);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
if (ret)
goto fail;
/*
- * we're an inline extent, so nobody can
- * extend the file past i_size without locking
- * a page we already have locked.
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
*
- * We must do any isize and inode updates
- * before we unlock the pages. Otherwise we
- * could end up racing with unlink.
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
*/
- BTRFS_I(inode)->disk_i_size = inode->i_size;
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
+
fail:
return ret;
}
@@ -339,35 +348,31 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
- u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(&inode->vfs_inode);
- u64 actual_end = min(end + 1, isize);
- u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, fs_info->sectorsize);
- u64 data_len = inline_len;
+ u64 data_len = (compressed_size ?: size);
int ret;
struct btrfs_path *path;
- if (compressed_size)
- data_len = compressed_size;
-
- if (start > 0 ||
- actual_end > fs_info->sectorsize ||
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- (!compressed_size &&
- (actual_end & (fs_info->sectorsize - 1)) == 0) ||
- end + 1 < isize ||
- data_len > fs_info->max_inline) {
+ data_len > fs_info->max_inline)
return 1;
- }
path = btrfs_alloc_path();
if (!path)
@@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
trans->block_rsv = &inode->block_rsv;
drop_args.path = path;
- drop_args.start = start;
- drop_args.end = aligned_end;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
drop_args.drop_cache = true;
drop_args.replace_extent = true;
-
- if (compressed_size && compressed_pages)
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- compressed_size);
- else
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- inline_len);
-
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- if (isize > actual_end)
- inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
- root, &inode->vfs_inode, start,
- inline_len, compressed_size,
- compress_type, compressed_pages);
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -624,7 +619,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
@@ -735,14 +729,15 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
0, BTRFS_COMPRESS_NONE,
- NULL);
+ NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
total_compressed,
- compress_type, pages);
+ compress_type, pages,
+ false);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
@@ -981,11 +976,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
- ins.objectid, /* disk_bytenr */
- async_extent->ram_size, /* num_bytes */
- ins.offset, /* disk_num_bytes */
- async_extent->compress_type);
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, start, end, 0);
goto out_free_reserve;
@@ -1003,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
async_chunk->write_flags,
- async_chunk->blkcg_css)) {
+ async_chunk->blkcg_css, true)) {
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -1152,9 +1150,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
+
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1234,9 +1235,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size,
- BTRFS_ORDERED_REGULAR);
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto out_drop_extent_cache;
@@ -1895,10 +1897,11 @@ out_check:
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_PREALLOC);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, num_bytes, num_bytes,
+ disk_bytenr, num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
@@ -1907,9 +1910,11 @@ out_check:
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
+ num_bytes, num_bytes,
disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_NOCOW);
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto error;
}
@@ -2310,7 +2315,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
}
/*
@@ -2538,10 +2543,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ /*
+ * btrfs_submit_compressed_read will handle completing
+ * the bio if there were any errors, so just return
+ * here.
+ */
ret = btrfs_submit_compressed_read(inode, bio,
mirror_num,
bio_flags);
- goto out;
+ goto out_no_endio;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2562,7 +2572,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
if (ret)
goto out;
}
@@ -2575,6 +2585,7 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
+out_no_endio:
return ret;
}
@@ -2870,6 +2881,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -2944,7 +2956,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
goto out;
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
- file_pos, qgroup_reserved, &ins);
+ file_pos - offset,
+ qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2970,20 +2983,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
- u64 logical_len;
bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- logical_len = oe->truncated_len;
- else
- logical_len = oe->num_bytes;
- btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
- btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ num_bytes = ram_bytes = oe->truncated_len;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
@@ -2994,6 +3007,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
@@ -3028,7 +3042,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
@@ -4062,7 +4077,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4118,15 +4134,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
- btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unncessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
+ index);
+ }
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4158,7 +4186,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -4565,14 +4593,21 @@ out_up_write:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
return btrfs_delete_subvolume(dir, dentry);
+ }
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4611,7 +4646,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
@@ -4664,7 +4699,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -4876,8 +4911,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
goto next;
}
hole_em->start = cur_offset;
@@ -5584,21 +5618,17 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
static inline u8 btrfs_inode_type(struct inode *inode)
{
- /*
- * Compile-time asserts that generic FT_* types still match
- * BTRFS_FT_* types
- */
- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
-
return fs_umode_to_ftype(inode->i_mode);
}
@@ -5971,14 +6001,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
goto out;
ret = 0;
- /*
- * MAGIC NUMBER EXPLANATION:
- * since we search a directory based on f_pos we have to start at 2
- * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
- * else has to start at 2
- */
if (path->slots[0] == 0) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -5989,7 +6013,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -6140,7 +6164,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
@@ -6537,7 +6561,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -7040,8 +7064,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
- block_len, type);
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
if (ret) {
if (em) {
free_extent_map(em);
@@ -7441,7 +7468,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
@@ -7831,7 +7858,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7888,7 +7915,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret)
goto err;
} else {
@@ -8104,8 +8131,13 @@ int btrfs_readpage(struct file *file, struct page *page)
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio)
- ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (bio_ctrl.bio) {
+ int ret2;
+
+ ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (ret == 0)
+ ret = ret2;
+ }
return ret;
}
@@ -8734,7 +8766,7 @@ out:
* extents beyond i_size to drop.
*/
if (control.extents_found > 0)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
return ret;
}
@@ -9030,14 +9062,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
int ret2;
- bool root_log_pinned = false;
- bool dest_log_pinned = false;
bool need_abort = false;
/*
@@ -9140,29 +9172,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
- /*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
- *
- * We pin the logs even if at this precise moment none of the inodes was
- * logged before. This is because right after we checked for that, some
- * other task fsyncing some other inode not involved with this rename
- * operation could log that one of our inodes exists.
- *
- * We don't need to pin the logs before the above calls to
- * btrfs_insert_inode_ref(), since those don't ever need to change a log.
- */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
- }
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
- }
-
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9170,7 +9179,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9186,7 +9196,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9216,46 +9227,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
- if (root_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- old_dentry->d_parent);
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
-out_fail:
/*
- * If we have pinned a log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
*/
- if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
- btrfs_set_log_full_commit(trans);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
- if (root_log_pinned) {
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
- }
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+out_fail:
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9330,11 +9326,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
- bool log_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9439,29 +9435,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
- /*
- * Now pin the log. We do it to ensure that no other task can
- * sync the log while we are in progress with the rename, as
- * that could result in an inconsistency in case any of the
- * inodes that are part of this rename operation were logged
- * before.
- *
- * We pin the log even if at this precise moment none of the
- * inodes was logged before. This is because right after we
- * checked for that, some other task fsyncing some other inode
- * not involved with this rename operation could log that one of
- * our inodes exists.
- *
- * We don't need to pin the logs before the above call to
- * btrfs_insert_inode_ref(), since that does not need to change
- * a log.
- */
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9503,12 +9481,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
@@ -9520,28 +9495,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
}
out_fail:
- /*
- * If we have pinned the log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && log_pinned) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -10021,8 +9974,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em = alloc_extent_map();
if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10210,6 +10162,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
+static int btrfs_encoded_io_compression_from_extent(
+ struct btrfs_fs_info *fs_info,
+ int compress_type)
+{
+ switch (compress_type) {
+ case BTRFS_COMPRESS_NONE:
+ return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+ case BTRFS_COMPRESS_ZLIB:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+ case BTRFS_COMPRESS_LZO:
+ /*
+ * The LZO format depends on the sector size. 64K is the maximum
+ * sector size that we support.
+ */
+ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+ return -EINVAL;
+ return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+ (fs_info->sectorsize_bits - 12);
+ case BTRFS_COMPRESS_ZSTD:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+ default:
+ return -EUCLEAN;
+ }
+}
+
+static ssize_t btrfs_encoded_read_inline(
+ struct kiocb *iocb,
+ struct iov_iter *iter, u64 start,
+ u64 lockend,
+ struct extent_state **cached_state,
+ u64 extent_start, size_t count,
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *item;
+ u64 ram_bytes;
+ unsigned long ptr;
+ void *tmp;
+ ssize_t ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ extent_start, 0);
+ if (ret) {
+ if (ret > 0) {
+ /* The extent item disappeared? */
+ ret = -EIO;
+ }
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ encoded->len = min_t(u64, extent_start + ram_bytes,
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, item));
+ if (ret < 0)
+ goto out;
+ encoded->compression = ret;
+ if (encoded->compression) {
+ size_t inline_size;
+
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]);
+ if (inline_size > count) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+ count = inline_size;
+ encoded->unencoded_len = ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - extent_start;
+ } else {
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ ptr += iocb->ki_pos - extent_start;
+ }
+
+ tmp = kmalloc(count, GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(leaf, tmp, ptr, count);
+ btrfs_release_path(path);
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ ret = copy_to_iter(tmp, count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ kfree(tmp);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_encoded_read_private {
+ struct btrfs_inode *inode;
+ u64 file_offset;
+ wait_queue_head_t wait;
+ atomic_t pending;
+ blk_status_t status;
+ bool skip_csum;
+};
+
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+ struct bio *bio, int mirror_num)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ blk_status_t ret;
+
+ if (!priv->skip_csum) {
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret) {
+ btrfs_bio_free_csum(bbio);
+ return ret;
+ }
+
+ atomic_inc(&priv->pending);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (ret) {
+ atomic_dec(&priv->pending);
+ btrfs_bio_free_csum(bbio);
+ }
+ return ret;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+ const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+ struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_inode *inode = priv->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ u64 start = priv->file_offset;
+ u32 bio_offset = 0;
+
+ if (priv->skip_csum || !uptodate)
+ return bbio->bio.bi_status;
+
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ unsigned int i, nr_sectors, pgoff;
+
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+ for (i = 0; i < nr_sectors; i++) {
+ ASSERT(pgoff < PAGE_SIZE);
+ if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff, start))
+ return BLK_STS_IOERR;
+ start += sectorsize;
+ bio_offset += sectorsize;
+ pgoff += sectorsize;
+ }
+ }
+ return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t status;
+
+ status = btrfs_encoded_read_verify_csum(bbio);
+ if (status) {
+ /*
+ * The memory barrier implied by the atomic_dec_return() here
+ * pairs with the memory barrier implied by the
+ * atomic_dec_return() or io_wait_event() in
+ * btrfs_encoded_read_regular_fill_pages() to ensure that this
+ * write is observed before the load of status in
+ * btrfs_encoded_read_regular_fill_pages().
+ */
+ WRITE_ONCE(priv->status, status);
+ }
+ if (!atomic_dec_return(&priv->pending))
+ wake_up(&priv->wait);
+ btrfs_bio_free_csum(bbio);
+ bio_put(bio);
+}
+
+static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset,
+ u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_encoded_read_private priv = {
+ .inode = inode,
+ .file_offset = file_offset,
+ .pending = ATOMIC_INIT(1),
+ .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+ };
+ unsigned long i = 0;
+ u64 cur = 0;
+ int ret;
+
+ init_waitqueue_head(&priv.wait);
+ /*
+ * Submit bios for the extent, splitting due to bio or stripe limits as
+ * necessary.
+ */
+ while (cur < disk_io_size) {
+ struct extent_map *em;
+ struct btrfs_io_geometry geom;
+ struct bio *bio = NULL;
+ u64 remaining;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+ disk_io_size - cur);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ } else {
+ ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+ disk_bytenr + cur, &geom);
+ free_extent_map(em);
+ }
+ if (ret) {
+ WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+ break;
+ }
+ remaining = min(geom.len, disk_io_size - cur);
+ while (bio || remaining) {
+ size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+ if (!bio) {
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio->bi_iter.bi_sector =
+ (disk_bytenr + cur) >> SECTOR_SHIFT;
+ bio->bi_end_io = btrfs_encoded_read_endio;
+ bio->bi_private = &priv;
+ bio->bi_opf = REQ_OP_READ;
+ }
+
+ if (!bytes ||
+ bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+ blk_status_t status;
+
+ status = submit_encoded_read_bio(inode, bio, 0);
+ if (status) {
+ WRITE_ONCE(priv.status, status);
+ bio_put(bio);
+ goto out;
+ }
+ bio = NULL;
+ continue;
+ }
+
+ i++;
+ cur += bytes;
+ remaining -= bytes;
+ }
+ }
+
+out:
+ if (atomic_dec_return(&priv.pending))
+ io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+ struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ unsigned long nr_pages, i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages);
+ if (ret)
+ goto out;
+
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ if (compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+ page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < count) {
+ size_t bytes = min_t(size_t, count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(pages[i], page_offset, bytes,
+ iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+ i++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = count;
+out:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent_bits(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap(pages[i]);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap(pages[i]);
+ }
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent_bits(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+out_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d47ec5fc4f4..238cee5b5254 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 {
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32)
+
+struct btrfs_ioctl_encoded_io_args_32 {
+ compat_uptr_t iov;
+ compat_ulong_t iovcnt;
+ __s64 offset;
+ __u64 flags;
+ __u64 len;
+ __u64 unencoded_len;
+ __u64 unencoded_offset;
+ __u32 compression;
+ __u32 encryption;
+ __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
@@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
}
}
-static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
- struct inode *inode = file_inode(file);
-
return put_user(inode->i_generation, arg);
}
@@ -753,6 +770,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
+ /* We do not support snapshotting right now. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_warn(fs_info,
+ "extent tree v2 doesn't support snapshotting yet");
+ return -EOPNOTSUPP;
+ }
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -1522,6 +1546,7 @@ next:
}
#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
/*
* Defrag one contiguous target range.
@@ -1667,7 +1692,6 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
LIST_HEAD(target_list);
int ret;
- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, false,
&target_list, NULL);
@@ -1810,9 +1834,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
u64 last_scanned = cur;
u64 cluster_end;
- /* The cluster size 256K should always be page aligned */
- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
if (btrfs_defrag_cancelled(fs_info)) {
ret = -EAGAIN;
break;
@@ -2229,10 +2250,9 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -2562,12 +2582,11 @@ err:
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct file *file,
- void __user *argp)
+static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+ void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs;
struct btrfs_ioctl_search_key sk;
- struct inode *inode;
int ret;
size_t buf_size;
@@ -2581,7 +2600,6 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
buf_size = sizeof(uargs->buf);
- inode = file_inode(file);
ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
/*
@@ -2596,12 +2614,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg;
struct btrfs_ioctl_search_args_v2 args;
- struct inode *inode;
int ret;
size_t buf_size;
const size_t buf_limit = SZ_16M;
@@ -2620,7 +2637,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
if (buf_size > buf_limit)
buf_size = buf_limit;
- inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
@@ -2871,25 +2887,22 @@ out:
return ret;
}
-static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_ino_lookup_args *args;
- struct inode *inode;
int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
- inode = file_inode(file);
-
/*
* Unprivileged query to obtain the containing subvolume root id. The
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ args->treeid = root->root_key.objectid;
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2901,7 +2914,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
goto out;
}
- ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_search_path_in_tree(root->fs_info,
args->treeid, args->objectid,
args->name);
@@ -2957,7 +2970,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
-static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
@@ -2969,7 +2982,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
- struct inode *inode;
int slot;
int ret = 0;
@@ -2983,7 +2995,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
return -ENOMEM;
}
- inode = file_inode(file);
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
@@ -3077,15 +3088,14 @@ out_free:
* Return ROOT_REF information of the subvolume containing this inode
* except the subvolume name.
*/
-static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
+ void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct inode *inode;
u64 objectid;
int slot;
int ret;
@@ -3101,15 +3111,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
return PTR_ERR(rootrefs);
}
- inode = file_inode(file);
- root = BTRFS_I(inode)->root->fs_info->tree_root;
- objectid = BTRFS_I(inode)->root->root_key.objectid;
-
+ objectid = root->root_key.objectid;
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;
+ root = root->fs_info->tree_root;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
@@ -3189,6 +3197,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int err = 0;
bool destroy_parent = false;
+ /* We don't support snapshots with extent tree v2 yet. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
+
if (destroy_v2) {
vol_args2 = memdup_user(arg, sizeof(*vol_args2));
if (IS_ERR(vol_args2))
@@ -3464,6 +3479,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -3989,6 +4009,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
@@ -4088,6 +4113,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
p = memdup_user(arg, sizeof(*p));
if (IS_ERR(p))
return PTR_ERR(p);
@@ -5149,7 +5179,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -5179,11 +5209,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(file, arg);
+ ret = btrfs_ioctl_send(inode, arg);
kfree(arg);
return ret;
}
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+ bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+ if (args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ if (ret >= 0) {
+ fsnotify_access(file);
+ if (copy_to_user(argp + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel))
+ ret = -EFAULT;
+ }
+
+out_iov:
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, argp, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+ args.len = args32.len;
+ args.unencoded_len = args32.unencoded_len;
+ args.unencoded_offset = args32.unencoded_offset;
+ args.compression = args32.compression;
+ args.encryption = args32.encryption;
+ memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
+ goto out_acct;
+ if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (args.unencoded_offset > args.unencoded_len)
+ goto out_acct;
+ if (args.len > args.unencoded_len - args.unencoded_offset)
+ goto out_acct;
+
+ ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ file_start_write(file);
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_end_write;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, args.len);
+ if (ret < 0)
+ goto out_end_write;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0);
+ if (ret)
+ goto out_end_write;
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_do_write_iter(&kiocb, &iter, &args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+out_end_write:
+ file_end_write(file);
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -5194,7 +5407,7 @@ long btrfs_ioctl(struct file *file, unsigned int
switch (cmd) {
case FS_IOC_GETVERSION:
- return btrfs_ioctl_getversion(file, argp);
+ return btrfs_ioctl_getversion(inode, argp);
case FS_IOC_GETFSLABEL:
return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
@@ -5214,7 +5427,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(file, argp);
+ return btrfs_ioctl_subvol_getflags(inode, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5238,11 +5451,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(file, argp);
+ return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(file, argp);
+ return btrfs_ioctl_tree_search_v2(inode, argp);
case BTRFS_IOC_INO_LOOKUP:
- return btrfs_ioctl_ino_lookup(file, argp);
+ return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
@@ -5289,10 +5502,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(file, argp, false);
+ return _btrfs_ioctl_send(inode, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(file, argp, true);
+ return _btrfs_ioctl_send(inode, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5319,15 +5532,25 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
- return btrfs_ioctl_get_subvol_info(file, argp);
+ return btrfs_ioctl_get_subvol_info(inode, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
- return btrfs_ioctl_get_subvol_rootref(file, argp);
+ return btrfs_ioctl_get_subvol_rootref(root, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
case FS_IOC_ENABLE_VERITY:
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
+ case BTRFS_IOC_ENCODED_READ:
+ return btrfs_ioctl_encoded_read(file, argp, false);
+ case BTRFS_IOC_ENCODED_WRITE:
+ return btrfs_ioctl_encoded_write(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+ return btrfs_ioctl_encoded_read(file, argp, true);
+ case BTRFS_IOC_ENCODED_WRITE_32:
+ return btrfs_ioctl_encoded_write(file, argp, true);
+#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index e6e28a9c7987..430ad36b8b08 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -55,6 +55,9 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
+#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -380,7 +383,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap(cur_page);
cur_in += LZO_LEN;
- if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) {
+ if (seg_len > WORKSPACE_CBUF_LENGTH) {
/*
* seg_len shouldn't be larger than we have allocated
* for workspace->cbuf
@@ -433,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
+ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
char *kaddr;
unsigned long bytes;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6b51fd2ec5ac..1957b14b329a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/*
- * Allocate and add a new ordered_extent into the per-inode tree.
+/**
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
*
- * The tree is given a single reference on the ordered extent that was
- * inserted.
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted.
+ *
+ * Return: 0 or -ENOMEM.
*/
-static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type, int dio,
- int compress_type)
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
struct btrfs_ordered_extent *entry;
int ret;
- if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
@@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return -ENOMEM;
entry->file_offset = file_offset;
- entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
+ entry->ram_bytes = ram_bytes;
+ entry->disk_bytenr = disk_bytenr;
entry->disk_num_bytes = disk_num_bytes;
+ entry->offset = offset;
entry->bytes_left = num_bytes;
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
@@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC ||
- type == BTRFS_ORDERED_COMPRESSED);
- set_bit(type, &entry->flags);
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ entry->flags = flags;
percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
fs_info->delalloc_batch);
- if (dio)
- set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
-
/* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return 0;
}
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 1,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type)
-{
- ASSERT(compress_type != BTRFS_COMPRESS_NONE);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes,
- BTRFS_ORDERED_COMPRESSED, 0,
- compress_type);
-}
-
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
- if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
- false);
+ if (root != fs_info->tree_root) {
+ u64 release;
+
+ if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
+ release = entry->disk_num_bytes;
+ else
+ release = entry->num_bytes;
+ btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ }
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
@@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 file_offset = ordered->file_offset + pos;
u64 disk_bytenr = ordered->disk_bytenr + pos;
- u64 num_bytes = len;
- u64 disk_num_bytes = len;
- int type;
- unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
- int compress_type = ordered->compress_type;
- unsigned long weight;
- int ret;
-
- weight = hweight_long(flags_masked);
- WARN_ON_ONCE(weight > 1);
- if (!weight)
- type = 0;
- else
- type = __ffs(flags_masked);
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
/*
- * The splitting extent is already counted and will be added again
- * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
- * double counting.
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
*/
- percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
fs_info->delalloc_batch);
- if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
- WARN_ON_ONCE(1);
- ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
- file_offset, disk_bytenr, num_bytes,
- disk_num_bytes, compress_type);
- } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- } else {
- ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- }
-
- return ret;
+ WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
}
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4194e960ff61..ecad67a2c745 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,8 +74,18 @@ enum {
BTRFS_ORDERED_LOGGED_CSUM,
/* We wait for this extent to complete in the current transaction */
BTRFS_ORDERED_PENDING,
+ /* BTRFS_IOC_ENCODED_WRITE */
+ BTRFS_ORDERED_ENCODED,
};
+/* BTRFS_ORDERED_* flags that specify the type of the extent. */
+#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
+ (1UL << BTRFS_ORDERED_NOCOW) | \
+ (1UL << BTRFS_ORDERED_PREALLOC) | \
+ (1UL << BTRFS_ORDERED_COMPRESSED) | \
+ (1UL << BTRFS_ORDERED_DIRECT) | \
+ (1UL << BTRFS_ORDERED_ENCODED))
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -84,9 +94,11 @@ struct btrfs_ordered_extent {
* These fields directly correspond to the same fields in
* btrfs_file_extent_item.
*/
- u64 disk_bytenr;
u64 num_bytes;
+ u64 ram_bytes;
+ u64 disk_bytenr;
u64 disk_num_bytes;
+ u64 offset;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type);
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type);
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0775ae9f4419..dd8777872143 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
{ BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
};
@@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
- if (IS_ERR(next)) {
+ if (IS_ERR(next))
continue;
- } else if (!extent_buffer_uptodate(next)) {
+ if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
continue;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 30d42ea655ce..1866b1f0da01 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -25,18 +25,6 @@
#include "sysfs.h"
#include "tree-mod-log.h"
-/* TODO XXX FIXME
- * - subvol delete -> delete when ref goes to 0? delete limits also?
- * - reorganize keys
- * - compressed
- * - sync
- * - copy also limits on subvol creation
- * - limit
- * - caches for ulists
- * - performance benchmarks
- * - check all ioctl parameters
- */
-
/*
* Helpers to access qgroup reservation
*
@@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return 0;
}
-/* must be called with qgroup_lock held */
-static int add_relation_rb(struct btrfs_fs_info *fs_info,
- u64 memberid, u64 parentid)
+/*
+ * Add relation specified by two qgroups.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the qgroups is NULL
+ * <0 other errors
+ */
+static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup *member;
- struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
- member = find_qgroup_rb(fs_info, memberid);
- parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
@@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info,
return 0;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add relation specified by two qgoup ids.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the ids does not exist
+ * <0 other errors
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+
+ return __add_relation_rb(member, parent);
+}
+
+/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
@@ -948,6 +959,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "qgroups are currently unsupported in extent tree v2");
+ return -EINVAL;
+ }
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -1451,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = add_relation_rb(fs_info, src, dst);
+ ret = __add_relation_rb(member, parent);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
@@ -3268,7 +3285,8 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3298,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3316,7 +3332,7 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index a3930da4eb3f..04a88bfe4fcf 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -277,7 +277,7 @@ copy_inline_extent:
path->slots[0]),
size);
btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(dst));
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
if (!ret && !trans) {
@@ -494,7 +494,8 @@ process_slot:
&clone_info, &trans);
if (ret)
goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
/*
* Inline extents always have to start at file offset 0
* and can never be bigger then the sector size. We can
@@ -505,8 +506,12 @@ process_slot:
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
@@ -518,17 +523,22 @@ process_slot:
btrfs_release_path(path);
/*
- * If this is a new extent update the last_reflink_trans of both
- * inodes. This is used by fsync to make sure it does not log
- * multiple checksum items with overlapping ranges. For older
- * extents we don't need to do it since inode logging skips the
- * checksums for older extents. Also ignore holes and inline
- * extents because they don't have checksums in the csum tree.
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
*/
- if (extent_gen == trans->transid && disko > 0) {
+ if (extent_gen == trans->transid && disko > 0)
BTRFS_I(src)->last_reflink_trans = trans->transid;
- BTRFS_I(inode)->last_reflink_trans = trans->transid;
- }
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
@@ -575,8 +585,7 @@ process_slot:
* replaced file extent items.
*/
if (last_dest_end >= i_size_read(inode))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
last_dest_end, destoff + len - 1, NULL, &trans);
@@ -772,9 +781,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (btrfs_root_readonly(root_out))
return -EROFS;
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
+ ASSERT(inode_in->i_sb == inode_out->i_sb);
}
/* Don't make the dst file partly checksummed */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9d8054839782..fdc2c4b411f0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2599,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, block->owner,
block->key.offset, block->level, NULL);
- if (IS_ERR(eb)) {
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2997,7 +2997,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len);
+ clamped_len, clamped_len);
if (ret)
goto release_page;
@@ -4123,9 +4123,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
* this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
*/
-int btrfs_recover_relocation(struct btrfs_root *root)
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
@@ -4166,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_tree_root(root, &key);
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2e9a322773f2..11089568b287 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3190,7 +3190,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 generation;
int mirror_num;
struct btrfs_key key;
- u64 increment = map->stripe_len;
+ u64 increment;
u64 offset;
u64 extent_logical;
u64 extent_physical;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 201eb2628aea..7d1642937274 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -528,17 +528,12 @@ out:
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
- int ret;
-
p->reversed = from->reversed;
fs_path_reset(p);
- ret = fs_path_add_path(p, from);
-
- return ret;
+ return fs_path_add_path(p, from);
}
-
static void fs_path_unreverse(struct fs_path *p)
{
char *tmp;
@@ -7477,10 +7472,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
root->root_key.objectid, root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
+ struct btrfs_root *send_root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 23bcefc84e49..08602fdd600a 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -126,7 +126,7 @@ enum {
#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
#ifdef __KERNEL__
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
#endif
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 294242c194d8..b87931a458eb 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -737,6 +737,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
+ lockdep_assert_held(&space_info->lock);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -1061,7 +1063,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv->reserved;
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
- spin_unlock(&space_info->lock);
/*
* We don't want to include the global_rsv in our calculation,
@@ -1092,6 +1093,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
+ spin_unlock(&space_info->lock);
+
/*
* We don't want to reclaim everything, just a portion, so scale
* down the to_reclaim by 1/4. If it takes us down to 0,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4d947ba32da9..b228efe8ab6e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem sate.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
@@ -128,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
@@ -140,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
@@ -148,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.fmt = fmt;
vaf.va = &args;
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, function, line, errno, errstr, &vaf);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
- sb->s_id, function, line, errno, errstr);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
}
#endif
@@ -240,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.va = &args;
if (__ratelimit(ratelimit)) {
- if (fs_info)
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info->sb->s_id, &vaf);
- else
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
}
va_end(args);
@@ -861,6 +913,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_space_cache:
case Opt_space_cache_version:
+ /*
+ * We already set FREE_SPACE_TREE above because we have
+ * compat_ro(FREE_SPACE_TREE) set, and we aren't going
+ * to allow v1 to be set for extent tree v2, simply
+ * ignore this setting if we're extent tree v2.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(info->mount_opt,
@@ -881,6 +941,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
+ /*
+ * We cannot operate without the free space tree with
+ * extent tree v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (btrfs_test_opt(info, SPACE_CACHE)) {
btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
@@ -896,6 +962,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
+ /*
+ * We cannot clear the free space tree with extent tree
+ * v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
@@ -2383,6 +2455,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
+ dev_t devt = 0;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
@@ -2402,7 +2475,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
- ret = btrfs_forget_devices(vol->name);
+ if (vol->name[0] != 0) {
+ ret = lookup_bdev(vol->name, &devt);
+ if (ret)
+ break;
+ }
+ ret = btrfs_forget_devices(devt);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index beb7f72d50b8..17389a42a3ab 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-/* Remove once support for zoned allocation is feature complete */
#ifdef CONFIG_BTRFS_DEBUG
+/* Remove once support for zoned allocation is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+/* Remove once support for extent tree v2 is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid1c34),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(zoned),
+ BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -1104,6 +1107,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
+ ARRAY_SIZE(btrfs_feature_attrs));
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
@@ -1272,11 +1280,6 @@ static void init_feature_attrs(void)
struct btrfs_feature_attr *fa;
int set, i;
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
- ARRAY_SIZE(btrfs_feature_attrs));
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
- ARRAY_SIZE(btrfs_feature_attrs[0]));
-
memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
memset(btrfs_unknown_feature_names, 0,
sizeof(btrfs_unknown_feature_names));
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 319fed82d741..c5b3a631bf4f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
+ write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
@@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#endif
free_extent_map(em);
}
+ write_unlock(&em_tree->lock);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1f1c25db6f6b..b008c5110958 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1911,6 +1911,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ root_item = &fs_info->block_group_root->root_item;
+
+ super->block_group_root = root_item->bytenr;
+ super->block_group_root_generation = root_item->generation;
+ super->block_group_root_level = root_item->level;
+ }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -2362,6 +2370,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_set_root_node(&fs_info->block_group_root->root_item,
+ fs_info->block_group_root->node);
+ list_add_tail(&fs_info->block_group_root->dirty_list,
+ &cur_trans->switch_commits);
+ }
+
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
@@ -2490,10 +2505,10 @@ cleanup_transaction:
* because btrfs_commit_super will poke cleaner thread and it will process it a
* few seconds later.
*/
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root;
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ba8a9826eb37..970ff316069d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -217,7 +217,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index aae5697dde32..e56c0107eea3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_block_group_item bgi;
u32 item_size = btrfs_item_size(leaf, slot);
+ u64 chunk_objectid;
u64 flags;
u64 type;
@@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
+ chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ /*
+ * We don't init the nr_global_roots until we load the global
+ * roots, so this could be 0 at mount time. If it's 0 we'll
+ * just assume we're fine, and later we'll check against our
+ * actual value.
+ */
+ if (unlikely(fs_info->nr_global_roots &&
+ chunk_objectid >= fs_info->nr_global_roots)) {
+ block_group_err(leaf, slot,
+ "invalid block group global root id, have %llu, needs to be <= %llu",
+ chunk_objectid,
+ fs_info->nr_global_roots);
+ return -EUCLEAN;
+ }
+ } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -1648,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* These trees must never be empty */
if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
owner == BTRFS_DEV_TREE_OBJECTID ||
owner == BTRFS_FS_TREE_OBJECTID ||
owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
@@ -1657,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
owner);
return -EUCLEAN;
}
+
/* Unknown tree */
if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
}
+
+ /* EXTENT_TREE_V2 can have empty extent trees. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
return 0;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6bc8834ac8f7..571dae8ad65e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
@@ -294,16 +288,6 @@ struct walk_control {
*/
int free;
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
/* pin only walk, we record which extents on disk belong to the
* log trees
*/
@@ -354,17 +338,15 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
- if (wc->pin)
+ if (wc->pin) {
ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
+ if (ret)
+ return ret;
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, 0) &&
+ btrfs_header_level(eb) == 0)
ret = btrfs_exclude_logged_extents(eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
}
return ret;
}
@@ -917,6 +899,26 @@ out:
return ret;
}
+static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const char *name,
+ int name_len)
+{
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ if (ret)
+ return ret;
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ return btrfs_run_delayed_items(trans);
+}
+
/*
* when cleaning up conflicts between the directory names in the
* subvolume, directory names in the log and directory names in the
@@ -959,12 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
name_len);
- if (ret)
- goto out;
- else
- ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -1124,14 +1122,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, dir, inode,
+ ret = unlink_inode_for_log_replay(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- return ret;
*search_done = 1;
goto again;
}
@@ -1196,14 +1191,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
victim_name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(
- trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -1358,19 +1350,10 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
- /*
- * Whenever we need to check if a name exists or not, we
- * check the subvolume tree. So after an unlink we must
- * run delayed items, so that future checks for a name
- * during log replay see that the name does not exists
- * anymore.
- */
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1466,8 +1449,8 @@ static int add_link(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
if (ret)
goto out;
/*
@@ -1476,10 +1459,6 @@ static int add_link(struct btrfs_trans_handle *trans,
*/
if (other_inode->i_nlink == 0)
inc_nlink(other_inode);
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
@@ -1612,7 +1591,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -1623,15 +1602,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
- /*
- * Whenever we need to check if a name exists or
- * not, we check the subvolume tree. So after an
- * unlink we must run delayed items, so that future
- * checks for a name during log replay see that the
- * name does not exists anymore.
- */
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -2368,15 +2338,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
goto out;
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len);
- if (ret)
- goto out;
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
-
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, name_len);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -3495,35 +3458,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
- * Check if an inode was logged in the current transaction. This may often
- * return some false positives, because logged_trans is an in memory only field,
- * not persisted anywhere. This is meant to be used in contexts where a false
- * positive has no functional consequences.
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
*/
-static bool inode_logged(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static int inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
if (inode->logged_trans == trans->transid)
- return true;
+ return 1;
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
- return false;
+ /*
+ * If logged_trans is not 0, then we know the inode logged was not logged
+ * in this transaction, so we can return false right away.
+ */
+ if (inode->logged_trans > 0)
+ return 0;
/*
- * The inode's logged_trans is always 0 when we load it (because it is
- * not persisted in the inode item or elsewhere). So if it is 0, the
- * inode was last modified in the current transaction then the inode may
- * have been logged before in the current transaction, then evicted and
- * loaded again in the current transaction - or may have never been logged
- * in the current transaction, but since we can not be sure, we have to
- * assume it was, otherwise our callers can leave an inconsistent log.
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
*/
- if (inode->logged_trans == 0 &&
- inode->last_trans == trans->transid &&
- !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
- return true;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may have not been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
- return false;
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less then the
+ * current transaction to avoid doing the search in future calls.
+ */
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transacion's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ inode->logged_trans = trans->transid;
+
+ /*
+ * If it's a directory, then we must set last_dir_index_offset to the
+ * maximum possible value, so that the next attempt to log the inode does
+ * not skip checking if dir index keys found in modified subvolume tree
+ * leaves have been logged before, otherwise it would result in attempts
+ * to insert duplicate dir index keys in the log tree. This must be done
+ * because last_dir_index_offset is an in-memory only field, not persisted
+ * in the inode item or any other on-disk structure, so its value is lost
+ * once the inode is evicted.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_dir_index_offset = (u64)-1;
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exists
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const char *name, int name_len,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
+
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_delete_one_dir_name(trans, log, path, di);
}
/*
@@ -3552,15 +3636,16 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
- int err = 0;
- u64 dir_ino = btrfs_ino(dir);
- if (!inode_logged(trans, dir))
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
return;
+ }
ret = join_running_log_trans(root);
if (ret)
@@ -3568,41 +3653,18 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
mutex_lock(&dir->log_mutex);
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_unlock;
}
- /*
- * We only log dir index items of a directory, so we don't need to look
- * for dir item keys.
- */
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- if (ret) {
- err = ret;
- goto fail;
- }
- }
-
- /*
- * We do not need to update the size field of the directory's inode item
- * because on log replay we update the field to reflect all existing
- * entries in the directory (see overwrite_item()).
- */
-fail:
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, name_len, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err < 0)
+ if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
}
@@ -3617,8 +3679,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (!inode_logged(trans, inode))
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
return;
+ }
ret = join_running_log_trans(root);
if (ret)
@@ -3743,19 +3810,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
{
struct btrfs_root *log = inode->root->log_root;
struct extent_buffer *src = path->nodes[0];
const int nritems = btrfs_header_nritems(src);
const u64 ino = btrfs_ino(inode);
- const bool inode_logged_before = inode_logged(trans, inode);
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -3766,7 +3834,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
break;
}
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
ctx->last_dir_item_offset = key.offset;
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ /*
+ * -EEXIST should never happen because when we
+ * log a directory in full mode (LOG_INODE_ALL)
+ * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from
+ * the log tree.
+ */
+ ASSERT(ret != -EEXIST);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
/*
* We must make sure that when we log a directory entry, the
* corresponding inode, after log replay, has a matching link
@@ -3790,25 +3885,23 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
* resulting in -ENOTEMPTY errors.
*/
if (!ctx->log_new_dentries) {
- struct btrfs_dir_item *di;
struct btrfs_key di_key;
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(src, di, &di_key);
- if ((btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- di_key.type != BTRFS_ROOT_ITEM_KEY)
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
ctx->log_new_dentries = true;
}
- if (!inode_logged_before)
+ if (!ctx->logged_before)
goto add_to_batch;
/*
* If we were logged before and have logged dir items, we can skip
* checking if any item with a key offset larger than the last one
* we logged is in the log tree, saving time and avoiding adding
- * contention on the log tree.
+ * contention on the log tree. We can only rely on the value of
+ * last_dir_index_offset when we know for sure that the inode was
+ * previously logged in the current transaction.
*/
if (key.offset > inode->last_dir_index_offset)
goto add_to_batch;
@@ -3878,7 +3971,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
int err = 0;
int ret;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
@@ -3912,10 +4005,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
if (tmp.type == BTRFS_DIR_INDEX_KEY)
- first_offset = max(min_offset, tmp.offset) + 1;
+ last_old_dentry_offset = tmp.offset;
}
goto done;
}
@@ -3924,17 +4018,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.type == BTRFS_DIR_INDEX_KEY) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
btrfs_release_path(path);
@@ -3956,7 +4051,8 @@ search:
* from our directory
*/
while (1) {
- ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
if (ret != 0) {
if (ret < 0)
err = ret;
@@ -3982,14 +4078,16 @@ search:
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ctx->last_dir_item_offset = min_key.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &min_key);
- if (ret)
- err = ret;
- else
- last_offset = min_key.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
if (need_resched()) {
@@ -4005,13 +4103,21 @@ done:
if (err == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is == to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, ino, first_offset,
- last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset);
+ if (last_old_dentry_offset < last_offset) {
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
}
return err;
}
@@ -4038,22 +4144,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 max_key;
int ret;
- /*
- * If this is the first time we are being logged in the current
- * transaction, or we were logged before but the inode was evicted and
- * reloaded later, in which case its logged_trans is 0, reset the value
- * of the last logged key offset. Note that we don't use the helper
- * function inode_logged() here - that is because the function returns
- * true after an inode eviction, assuming the worst case as it can not
- * know for sure if the inode was logged before. So we can not skip key
- * searches in the case the inode was evicted, because it may not have
- * been logged in this transaction and may have been logged in a past
- * transaction, so we need to reset the last dir index offset to (u64)-1.
- */
- if (inode->logged_trans != trans->transid)
- inode->last_dir_index_offset = (u64)-1;
-
- min_key = 0;
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4089,9 +4180,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_key found_key;
int start_slot;
- if (!inode_logged(trans, inode))
- return 0;
-
key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -4311,23 +4399,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- unsigned long src_offset;
- unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- int ret;
+ int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
char *ins_data;
int i;
- struct list_head ordered_sums;
- int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
-
- INIT_LIST_HEAD(&ordered_sums);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4339,28 +4422,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
batch.total_data_size = 0;
- batch.nr = nr;
+ batch.nr = 0;
+ dst_index = 0;
for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size(src, i + start_slot);
- batch.total_data_size += ins_sizes[i];
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
}
+
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ goto out;
+
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
- if (ret) {
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ goto out;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
@@ -4368,71 +4575,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ src_offset, ins_sizes[dst_index]);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
-
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- struct btrfs_root *csum_root;
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
-
- csum_root = btrfs_csum_root(fs_info, ds);
- ret = btrfs_lookup_csums_range(csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret)
- break;
- }
- }
+ dst_index++;
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
+out:
kfree(ins_data);
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = log_csums(trans, inode, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
-
return ret;
}
@@ -4568,14 +4721,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *log = inode->root->log_root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
- struct btrfs_map_token token;
struct btrfs_key key;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
+ extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4589,12 +4762,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* are small, with a root at level 2 or 3 at most, due to their short
* life span.
*/
- if (inode_logged(trans, inode)) {
+ if (ctx->logged_before) {
drop_args.path = path;
drop_args.start = em->start;
drop_args.end = em->start + em->len;
drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
+ drop_args.extent_item_size = sizeof(fi);
ret = btrfs_drop_extents(trans, log, inode, &drop_args);
if (ret)
return ret;
@@ -4606,44 +4779,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
+ sizeof(fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
- btrfs_init_map_token(&token, leaf);
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_PREALLOC);
- else
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_REG);
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start -
- extent_offset);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
- }
-
- btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
- btrfs_set_token_file_extent_encryption(&token, fi, 0);
- btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -4857,7 +5000,6 @@ process:
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
if (ret)
@@ -5551,6 +5693,13 @@ next_key:
} else {
break;
}
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
@@ -5595,8 +5744,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = inode->root->log_root;
- int err = 0;
- int ret = 0;
+ int ret;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5605,6 +5753,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool xattrs_logged = false;
bool recursive_logging = false;
bool inode_item_dropped = true;
+ const bool orig_logged_before = ctx->logged_before;
path = btrfs_alloc_path();
if (!path)
@@ -5638,8 +5787,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* and figure out which index ranges have to be logged.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- err = btrfs_commit_inode_delayed_items(trans, inode);
- if (err)
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret)
goto out;
}
@@ -5655,6 +5804,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
* This is for cases where logging a directory could result in losing a
* a file after replaying the log. For example, if we move a file from a
* directory A to a directory B, then fsync directory A, we have no way
@@ -5665,7 +5825,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- err = 1;
+ ret = 1;
goto out_unlock;
}
@@ -5679,9 +5839,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode, max_key_type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5695,22 +5857,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- err = logged_inode_size(log, inode, path, &logged_isize);
- if (err)
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- if (inode_logged(trans, inode))
+ if (ctx->logged_before)
ret = truncate_inode_items(trans, log,
inode, 0, 0);
}
@@ -5720,8 +5883,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5730,37 +5894,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
- err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
recursive_logging, inode_only, ctx,
&need_log_inode_item);
- if (err)
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, inode, path);
- if (err)
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
- if (err)
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
goto out_unlock;
/*
* If we are doing a fast fsync and the inode was logged before
@@ -5771,18 +5933,16 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
@@ -5794,10 +5954,8 @@ log_extents:
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
ret = log_directory_changes(trans, inode, path, dst_path, ctx);
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
}
spin_lock(&inode->lock);
@@ -5836,12 +5994,24 @@ log_extents:
if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
+
+ /*
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
+ */
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
+
out_unlock:
mutex_unlock(&inode->log_mutex);
out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return err;
+
+ if (recursive_logging)
+ ctx->logged_before = orig_logged_before;
+
+ return ret;
}
/*
@@ -5926,7 +6096,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *log = root->log_root;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5968,7 +6137,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
min_key.offset = 0;
again:
btrfs_release_path(path);
- ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
if (ret < 0) {
goto next_dir_inode;
} else if (ret > 0) {
@@ -5976,7 +6145,6 @@ again:
goto next_dir_inode;
}
-process_leaf:
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
for (i = path->slots[0]; i < nritems; i++) {
@@ -5994,8 +6162,7 @@ process_leaf:
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
continue;
btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
if (di_key.type == BTRFS_ROOT_ITEM_KEY)
@@ -6033,16 +6200,6 @@ process_leaf:
}
break;
}
- if (i == nritems) {
- ret = btrfs_next_leaf(log, path);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
- goto process_leaf;
- }
if (min_key.offset < (u64)-1) {
min_key.offset++;
goto again;
@@ -6773,15 +6930,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+/**
+ * Update the log after adding a new name for an inode.
+ *
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
+ *
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6794,26 +6968,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (!inode_logged(trans, inode) &&
- (!old_dir || !inode_logged(trans, old_dir)))
- return;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
/*
* If we are doing a rename (old_dir is not NULL) from a directory that
- * was previously logged, make sure the next log attempt on the directory
- * is not skipped and logs the inode again. This is because the log may
- * not currently be authoritative for a range including the old
- * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
- * do not end up with both the new and old dentries around (in case the
- * inode is a directory we would have a directory with two hard links and
- * 2 inode references for different parents). The next log attempt of
- * old_dir will happen at btrfs_log_all_parents(), called through
- * btrfs_log_inode_parent() below, because we have previously set
- * inode->last_unlink_trans to the current transaction ID, either here or
- * at btrfs_record_unlink_dir() in case the inode is a directory.
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
*/
- if (old_dir)
- old_dir->logged_trans = 0;
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ log_pinned = true;
+ btrfs_pin_log_trans(root);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Other concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. So take the old
+ * directory's log_mutex to prevent getting an -EEXIST when
+ * logging a key to record the deletion, or having that other
+ * task logging the old directory get an -EEXIST if it attempts
+ * to log the same key after we just did it. In both cases that
+ * would result in falling back to a transaction commit.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto out;
+ }
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
@@ -6825,5 +7056,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index f6811c3df38a..1620f8170629 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Indicate if the inode being logged was logged before. */
+ bool logged_before;
/* Tracks the last logged dir item/index key offset. */
u64 last_dir_item_offset;
struct inode *inode;
@@ -32,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
@@ -86,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent);
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b07d382d53a8..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -534,30 +534,20 @@ error:
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
-{
- int found;
-
- rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
- rcu_read_unlock();
-
- return found == 0;
-}
-
-/*
- * Search and remove all stale (devices which are not mounted) devices.
+/**
+ * Search and remove all stale devices (which are not mounted).
* When both inputs are NULL, it will search and release all stale devices.
- * path: Optional. When provided will it release all unmounted devices
- * matching this path only.
- * skip_dev: Optional. Will skip this device when searching for the stale
+ *
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
* devices.
- * Return: 0 for success or if @path is NULL.
- * -EBUSY if @path is a mounted device.
- * -ENOENT if @path does not match any device in the list.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
*/
-static int btrfs_free_stale_devices(const char *path,
- struct btrfs_device *skip_device)
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
@@ -565,7 +555,7 @@ static int btrfs_free_stale_devices(const char *path,
lockdep_assert_held(&uuid_mutex);
- if (path)
+ if (devt)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
@@ -575,13 +565,11 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
- if (path && ret != 0)
+ if (devt && ret != 0)
ret = -EBUSY;
break;
}
@@ -614,7 +602,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
- struct request_queue *q;
struct block_device *bdev;
struct btrfs_super_block *disk_super;
u64 devid;
@@ -656,8 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- q = bdev_get_queue(bdev);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -781,11 +767,17 @@ static noinline struct btrfs_device *device_list_add(const char *path,
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int error;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+ error = lookup_bdev(path, &path_devt);
+ if (error)
+ return ERR_PTR(error);
+
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
@@ -868,6 +860,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
@@ -928,25 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
- int error;
- dev_t path_dev;
-
- error = lookup_bdev(path, &path_dev);
- if (error) {
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(error);
- }
-
- if (device->bdev->bd_dev != path_dev) {
+ if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -954,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
@@ -972,6 +955,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
+ device->devt = path_devt;
}
/*
@@ -1331,12 +1315,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return disk_super;
}
-int btrfs_forget_devices(const char *path)
+int btrfs_forget_devices(dev_t devt)
{
int ret;
mutex_lock(&uuid_mutex);
- ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ ret = btrfs_free_stale_devices(devt, NULL);
mutex_unlock(&uuid_mutex);
return ret;
@@ -1385,10 +1369,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
}
device = device_list_add(path, disk_super, &new_device_added);
- if (!IS_ERR(device)) {
- if (new_device_added)
- btrfs_free_stale_devices(path, device);
- }
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
btrfs_release_disk_super(disk_super);
@@ -2102,6 +2084,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
u64 num_devices;
int ret = 0;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* The device list in fs_devices is accessed without locks (neither
* uuid_mutex nor device_list_mutex) as it won't change on a mounted
@@ -2606,7 +2593,6 @@ error:
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
- struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
@@ -2668,6 +2654,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->fs_info = fs_info;
device->bdev = bdev;
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
@@ -2679,7 +2668,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_free_zone;
}
- q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
@@ -2727,7 +2715,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2814,7 +2802,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
- btrfs_forget_devices(device_path);
+ btrfs_forget_devices(device->devt);
/* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
@@ -3251,6 +3239,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u64 length;
int ret;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -7060,6 +7054,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
}
#endif
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
+
+ return dev;
+}
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
@@ -7147,28 +7162,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
BTRFS_UUID_SIZE);
args.uuid = uuid;
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!map->stripes[i].dev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
- free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid, true);
- return -ENOENT;
- }
if (!map->stripes[i].dev) {
- map->stripes[i].dev =
- add_missing_dev(fs_info->fs_devices, devid,
- uuid);
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- btrfs_err(fs_info,
- "failed to init missing dev %llu: %ld",
- devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid, false);
}
+
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
-
}
write_lock(&map_tree->lock);
@@ -8299,10 +8303,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
+ sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
+ sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8330,6 +8336,7 @@ out:
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 005c9e2a491a..bd297f23d19e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -72,6 +72,11 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
unsigned long dev_state;
blk_status_t last_flush_error;
@@ -505,7 +510,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
-int btrfs_forget_devices(const char *path);
+int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f559d517c7c4..b7b5fac1c779 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -652,8 +652,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
if (model == BLK_ZONED_HM ||
(model == BLK_ZONED_HA && incompat_zoned) ||
(model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info =
- device->zone_info;
+ struct btrfs_zoned_device_info *zone_info;
zone_info = device->zone_info;
zoned_devices++;
@@ -1215,12 +1214,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
- u64 physical = 0;
int ret;
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
u64 *caps = NULL;
+ u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1264,6 +1263,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
+ if (!physical) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
if (!active) {
ret = -ENOMEM;
@@ -1277,14 +1282,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int dev_replace_is_ongoing = 0;
device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ physical[i] = map->stripes[i].physical;
if (device->bdev == NULL) {
alloc_offsets[i] = WP_MISSING_DEV;
continue;
}
- is_sequential = btrfs_dev_is_sequential(device, physical);
+ is_sequential = btrfs_dev_is_sequential(device, physical[i]);
if (is_sequential)
num_sequential++;
else
@@ -1299,21 +1304,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* This zone will be used for allocation, so mark this zone
* non-empty.
*/
- btrfs_dev_clear_zone_empty(device, physical);
+ btrfs_dev_clear_zone_empty(device, physical[i]);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
up_read(&dev_replace->rwsem);
/*
* The group is mapped to a sequential zone. Get the zone write
* pointer to determine the allocation offset within the zone.
*/
- WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+ WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical, &zone);
+ ret = btrfs_get_dev_zone(device, physical[i], &zone);
memalloc_nofs_restore(nofs_flag);
if (ret == -EIO || ret == -EOPNOTSUPP) {
ret = 0;
@@ -1339,7 +1344,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
case BLK_ZONE_COND_READONLY:
btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical >> device->zone_info->zone_size_shift,
+ physical[i] >> device->zone_info->zone_size_shift,
rcu_str_deref(device->name), device->devid);
alloc_offsets[i] = WP_MISSING_DEV;
break;
@@ -1404,7 +1409,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
if (alloc_offsets[0] == WP_MISSING_DEV) {
btrfs_err(fs_info,
"zoned: cannot recover write pointer for zone %llu",
- physical);
+ physical[0]);
ret = -EIO;
goto out;
}
@@ -1413,6 +1418,42 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA) {
+ btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[1] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[1]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[0] != alloc_offsets[1]) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ cache->zone_is_active = test_bit(0, active);
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = min(caps[0], caps[1]);
+ break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID0:
case BTRFS_BLOCK_GROUP_RAID10:
@@ -1465,6 +1506,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
+ kfree(physical);
kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1781,50 +1823,55 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
struct btrfs_device *device;
u64 physical;
bool ret;
+ int i;
if (!btrfs_is_zoned(block_group->fs_info))
return true;
map = block_group->physical_map;
- /* Currently support SINGLE profile only */
- ASSERT(map->num_stripes == 1);
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
-
- if (device->zone_info->max_active_zones == 0)
- return true;
spin_lock(&block_group->lock);
-
if (block_group->zone_is_active) {
ret = true;
goto out_unlock;
}
- /* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
- ret = false;
- goto out_unlock;
- }
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
- if (!btrfs_dev_set_active_zone(device, physical)) {
- /* Cannot activate the zone */
- ret = false;
- goto out_unlock;
- }
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ if (i == map->num_stripes - 1)
+ block_group->zone_is_active = 1;
- /* Successfully activated all the zones */
- block_group->zone_is_active = 1;
+ }
spin_unlock(&block_group->lock);
- /* For the active block group list */
- btrfs_get_block_group(block_group);
+ if (block_group->zone_is_active) {
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(list_empty(&block_group->active_bg_list));
- list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
return true;
@@ -1840,19 +1887,12 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
struct btrfs_device *device;
u64 physical;
int ret = 0;
+ int i;
if (!btrfs_is_zoned(fs_info))
return 0;
map = block_group->physical_map;
- /* Currently support SINGLE profile only */
- ASSERT(map->num_stripes == 1);
-
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
-
- if (device->zone_info->max_active_zones == 0)
- return 0;
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
@@ -1904,25 +1944,34 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
- btrfs_dec_block_group_ro(block_group);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
- if (!ret) {
- btrfs_dev_clear_active_zone(device, physical);
+ if (device->zone_info->max_active_zones == 0)
+ continue;
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(!list_empty(&block_group->active_bg_list));
- list_del_init(&block_group->active_bg_list);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- /* For active_bg_list */
- btrfs_put_block_group(block_group);
+ if (ret)
+ return ret;
+
+ btrfs_dev_clear_active_zone(device, physical);
}
+ btrfs_dec_block_group_ro(block_group);
- return ret;
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
+ return 0;
}
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
diff --git a/fs/internal.h b/fs/internal.h
index 56c0477f4215..fb2c2ea807d7 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -158,11 +158,6 @@ extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
/*
- * read_write.c
- */
-extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
-
-/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1ed097e94af2..090bf47606ab 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -236,9 +236,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
if (!src_file.file)
return -EBADF;
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
- goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
olen, 0);
if (cloned < 0)
@@ -247,7 +244,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
-fdput:
fdput(src_file);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 0074afa7ecb3..dc5000173b80 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -385,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
return security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
}
+EXPORT_SYMBOL(rw_verify_area);
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
@@ -1617,24 +1618,16 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
return 0;
}
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+/* Like generic_write_checks(), but takes size of write instead of iter. */
+int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
if (IS_SWAPFILE(inode))
return -ETXTBSY;
- if (!iov_iter_count(from))
+ if (!*count)
return 0;
/* FIXME: this is for backwards compatibility with 2.4 */
@@ -1644,8 +1637,23 @@ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+EXPORT_SYMBOL(generic_write_checks_count);
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ loff_t count = iov_iter_count(from);
+ int ret;
+
+ ret = generic_write_checks_count(iocb, &count);
if (ret)
return ret;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 231159682907..bc5fb006dc79 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -362,11 +362,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
@@ -458,7 +453,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
+ if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
goto out_drop_write;
ret = -EISDIR;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2d892b201b0..27746a3da8fd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3130,6 +3130,7 @@ extern int sb_min_blocksize(struct super_block *, int);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
extern int generic_write_check_limits(struct file *file, loff_t pos,
loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
@@ -3173,6 +3174,7 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
+int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 0d729664b4b4..f068ff30d654 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -53,6 +53,7 @@ struct btrfs_space_info;
{ BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \
{ BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, \
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, \
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },\
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
#define show_root_type(obj) \
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 738619994e26..d956b2993970 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -309,6 +309,7 @@ struct btrfs_ioctl_fs_info_args {
#define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10)
#define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11)
#define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12)
+#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
struct btrfs_ioctl_feature_flags {
__u64 compat_flags;
@@ -868,6 +869,134 @@ struct btrfs_ioctl_get_subvol_rootref_args {
__u8 align[7];
};
+/*
+ * Data and metadata for an encoded read or write.
+ *
+ * Encoded I/O bypasses any encoding automatically done by the filesystem (e.g.,
+ * compression). This can be used to read the compressed contents of a file or
+ * write pre-compressed data directly to a file.
+ *
+ * BTRFS_IOC_ENCODED_READ and BTRFS_IOC_ENCODED_WRITE are essentially
+ * preadv/pwritev with additional metadata about how the data is encoded and the
+ * size of the unencoded data.
+ *
+ * BTRFS_IOC_ENCODED_READ fills the given iovecs with the encoded data, fills
+ * the metadata fields, and returns the size of the encoded data. It reads one
+ * extent per call. It can also read data which is not encoded.
+ *
+ * BTRFS_IOC_ENCODED_WRITE uses the metadata fields, writes the encoded data
+ * from the iovecs, and returns the size of the encoded data. Note that the
+ * encoded data is not validated when it is written; if it is not valid (e.g.,
+ * it cannot be decompressed), then a subsequent read may return an error.
+ *
+ * Since the filesystem page cache contains decoded data, encoded I/O bypasses
+ * the page cache. Encoded I/O requires CAP_SYS_ADMIN.
+ */
+struct btrfs_ioctl_encoded_io_args {
+ /* Input parameters for both reads and writes. */
+
+ /*
+ * iovecs containing encoded data.
+ *
+ * For reads, if the size of the encoded data is larger than the sum of
+ * iov[n].iov_len for 0 <= n < iovcnt, then the ioctl fails with
+ * ENOBUFS.
+ *
+ * For writes, the size of the encoded data is the sum of iov[n].iov_len
+ * for 0 <= n < iovcnt. This must be less than 128 KiB (this limit may
+ * increase in the future). This must also be less than or equal to
+ * unencoded_len.
+ */
+ const struct iovec __user *iov;
+ /* Number of iovecs. */
+ unsigned long iovcnt;
+ /*
+ * Offset in file.
+ *
+ * For writes, must be aligned to the sector size of the filesystem.
+ */
+ __s64 offset;
+ /* Currently must be zero. */
+ __u64 flags;
+
+ /*
+ * For reads, the following members are output parameters that will
+ * contain the returned metadata for the encoded data.
+ * For writes, the following members must be set to the metadata for the
+ * encoded data.
+ */
+
+ /*
+ * Length of the data in the file.
+ *
+ * Must be less than or equal to unencoded_len - unencoded_offset. For
+ * writes, must be aligned to the sector size of the filesystem unless
+ * the data ends at or beyond the current end of the file.
+ */
+ __u64 len;
+ /*
+ * Length of the unencoded (i.e., decrypted and decompressed) data.
+ *
+ * For writes, must be no more than 128 KiB (this limit may increase in
+ * the future). If the unencoded data is actually longer than
+ * unencoded_len, then it is truncated; if it is shorter, then it is
+ * extended with zeroes.
+ */
+ __u64 unencoded_len;
+ /*
+ * Offset from the first byte of the unencoded data to the first byte of
+ * logical data in the file.
+ *
+ * Must be less than unencoded_len.
+ */
+ __u64 unencoded_offset;
+ /*
+ * BTRFS_ENCODED_IO_COMPRESSION_* type.
+ *
+ * For writes, must not be BTRFS_ENCODED_IO_COMPRESSION_NONE.
+ */
+ __u32 compression;
+ /* Currently always BTRFS_ENCODED_IO_ENCRYPTION_NONE. */
+ __u32 encryption;
+ /*
+ * Reserved for future expansion.
+ *
+ * For reads, always returned as zero. Users should check for non-zero
+ * bytes. If there are any, then the kernel has a newer version of this
+ * structure with additional information that the user definition is
+ * missing.
+ *
+ * For writes, must be zeroed.
+ */
+ __u8 reserved[64];
+};
+
+/* Data is not compressed. */
+#define BTRFS_ENCODED_IO_COMPRESSION_NONE 0
+/* Data is compressed as a single zlib stream. */
+#define BTRFS_ENCODED_IO_COMPRESSION_ZLIB 1
+/*
+ * Data is compressed as a single zstd frame with the windowLog compression
+ * parameter set to no more than 17.
+ */
+#define BTRFS_ENCODED_IO_COMPRESSION_ZSTD 2
+/*
+ * Data is compressed sector by sector (using the sector size indicated by the
+ * name of the constant) with LZO1X and wrapped in the format documented in
+ * fs/btrfs/lzo.c. For writes, the compression sector size must match the
+ * filesystem sector size.
+ */
+#define BTRFS_ENCODED_IO_COMPRESSION_LZO_4K 3
+#define BTRFS_ENCODED_IO_COMPRESSION_LZO_8K 4
+#define BTRFS_ENCODED_IO_COMPRESSION_LZO_16K 5
+#define BTRFS_ENCODED_IO_COMPRESSION_LZO_32K 6
+#define BTRFS_ENCODED_IO_COMPRESSION_LZO_64K 7
+#define BTRFS_ENCODED_IO_COMPRESSION_TYPES 8
+
+/* Data is not encrypted. */
+#define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0
+#define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1
+
/* Error codes as returned by the kernel */
enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -996,5 +1125,9 @@ enum btrfs_err_code {
struct btrfs_ioctl_ino_lookup_user_args)
#define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \
struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_ENCODED_READ _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args)
+#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args)
#endif /* _UAPI_LINUX_BTRFS_H */
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 5416f1f1a77a..b069752a8ecf 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -53,6 +53,9 @@
/* tracks free space in block groups. */
#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+/* Holds the block group items for extent tree v2. */
+#define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
+
/* device stats in the device tree */
#define BTRFS_DEV_STATS_OBJECTID 0ULL