summary | refs | log | tree | commit | diff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- fs/btrfs/tree-log.c 7299
1 files changed, 4456 insertions, 2843 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 920cee312f4e..fff37c8d96a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -13,13 +13,25 @@
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
-#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
-#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"
+#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "file.h"
+#include "orphan.h"
+#include "print-tree.h"
+#include "tree-checker.h"
+#include "delayed-inode.h"
+
+#define MAX_CONFLICT_INODES 10
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -30,8 +42,6 @@
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
@@ -93,20 +103,136 @@ enum {
LOG_WALK_REPLAY_ALL,
};
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key being processed of the current log leaf. */
+ struct btrfs_key log_key;
+ /* The slot being processed of the current log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Furthermore, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
+
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all);
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -131,6 +257,28 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* and once to do all the other items.
*/
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct btrfs_inode *inode;
+
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
@@ -141,16 +289,45 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
+ bool created = false;
+
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret) {
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
@@ -158,24 +335,28 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
- mutex_lock(&fs_info->tree_log_mutex);
- if (!fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, fs_info);
- mutex_unlock(&fs_info->tree_log_mutex);
- if (ret)
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Do the full commit not to mix
+ * nodes from multiple log transactions to do sequential
+ * writing.
+ */
+ if (zoned && !created) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
+ }
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
- atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -193,11 +374,22 @@ out:
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
@@ -211,9 +403,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -228,77 +418,14 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
-static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-{
- filemap_fdatawait_range(buf->pages[0]->mapping,
- buf->start, buf->start + buf->len - 1);
-}
-
-/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /*
- * Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
- */
- bool ignore_cur_inode;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen, int level);
-};
-
/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
+static int process_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -307,33 +434,46 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen, level, NULL);
- if (ret)
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen
+ };
+
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
- if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
- eb->len);
+ if (wc->pin) {
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
ret = btrfs_exclude_logged_extents(eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The given root is for the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and will be
+ * released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -341,61 +481,67 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
- int overwrite_root = 0;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
- overwrite_root = 1;
+ /*
+ * This is only used during log replay, so the root is always from a
+ * fs/subvolume tree. In case we ever need to support a log root, then
+ * we'll have to clone the leaf in the path, release the path and use
+ * the leaf before writing into the log tree. See the comments at
+ * copy_items() for more details.
+ */
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
- item_size = btrfs_item_size_nr(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
return ret;
+ }
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
- btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
+ if (!src_copy) {
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
-
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -404,7 +550,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -412,28 +558,28 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
- item = btrfs_item_ptr(eb, slot,
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
u32 mode;
@@ -441,40 +587,43 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- path->skip_release_on_error = 1;
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
- path->skip_release_on_error = 0;
+ wc->subvol_path->skip_release_on_error = true;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = false;
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
- u32 found_size;
- found_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(path, item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -485,16 +634,15 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0) {
- struct extent_buffer *dst_eb = path->nodes[0];
- const u64 ino_size = btrfs_inode_size(eb, src_item);
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
/*
* For regular files an ino_size == 0 is used only when
@@ -503,59 +651,55 @@ insert:
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
- if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
- if (overwrite_root &&
- S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
+static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
+ struct fscrypt_str *name)
{
- struct inode *inode;
+ char *buf;
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
- if (IS_ERR(inode))
- inode = NULL;
- return inode;
+ buf = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, buf, (unsigned long)start, len);
+ name->name = buf;
+ name->len = len;
+ return 0;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the
@@ -570,50 +714,53 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
- unsigned long size;
+ struct btrfs_inode *inode = NULL;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_ram_bytes(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size,
- fs_info->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
- }
-
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
+ return -EUCLEAN;
+ }
+
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
}
/*
@@ -621,219 +768,299 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
- btrfs_release_path(path);
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
- if (ret)
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
- goto update_inode;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
+ goto update_inode;
+ }
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
- /*
- * Manually record dirty extent, as here we did a shallow
- * file extent item copy and skip normal backref update,
- * but modifying extent tree all by ourselves.
- * So need to manually record dirty extent for qgroup,
- * as the owner of the file extent changed from log tree
- * (doesn't affect qgroup) to fs/file tree(affects qgroup)
- */
- ret = btrfs_qgroup_trace_extent(trans,
- btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item),
- GFP_NOFS);
- if (ret < 0)
+ /* A hole and NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
+
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
+
+ /*
+ * Manually record dirty extent, as here we did a shallow file extent
+ * item copy and skip normal backref update, but modifying extent tree
+ * all by ourselves. So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree (doesn't affect
+ * qgroup) to fs/file tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
goto out;
+ }
+ }
- if (ins.objectid > 0) {
- struct btrfs_ref ref = { 0 };
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
+ btrfs_release_path(wc->subvol_path);
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
- ins.offset);
- if (ret == 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key->objectid, offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- root->root_key.objectid,
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+ * Now delete all existing csums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying to look up
+ * the checksum of any block of our extent starting at an offset of 40K
+ * or higher, will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
- ret = btrfs_lookup_csums_range(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, 0);
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
if (ret)
- goto out;
- /*
- * Now delete all existing cums in the csum root that
- * cover our range. We do this because we can have an
- * extent that is completely referenced by one file
- * extent item and partially referenced by another
- * file extent item (like after using the clone or
- * extent_same ioctls). In this case if we end up doing
- * the replay of the one that partially references the
- * extent first, and we do not do the csum deletion
- * below, we can get 2 csum items in the csum tree that
- * overlap each other. For example, imagine our log has
- * the two following file extent items:
- *
- * key (257 EXTENT_DATA 409600)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 20480 nr 20480 ram 102400
- *
- * key (257 EXTENT_DATA 819200)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 0 nr 102400 ram 102400
- *
- * Where the second one fully references the 100K extent
- * that starts at disk byte 12845056, and the log tree
- * has a single csum item that covers the entire range
- * of the extent:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- *
- * After the first file extent item is replayed, the
- * csum tree gets the following csum item:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which covers the 20K sub-range starting at offset 20K
- * of our extent. Now when we replay the second file
- * extent item, if we do not delete existing csum items
- * that cover any of its blocks, we end up getting two
- * csum items in our csum tree that overlap each other:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which is a problem, because after this anyone trying
- * to lookup up for the checksum of any block of our
- * extent starting at an offset of 40K or higher, will
- * end up looking at the second csum item only, which
- * does not contain the checksum for any block starting
- * at offset 40K or higher of our extent.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = btrfs_del_csums(trans,
- fs_info->csum_root,
- sums->bytenr,
- sums->len);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- fs_info->csum_root, sums);
- list_del(&sums->list);
- kfree(sums);
- }
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
- goto out;
+ list_del(&sums->list);
+ kfree(sums);
}
-
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
if (ret)
goto out;
- inode_add_bytes(inode, nbytes);
update_inode:
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
- if (inode)
- iput(inode);
+ iput(&inode->vfs_inode);
+ return ret;
+}
+
+static int unlink_inode_for_log_replay(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_trans_handle *trans = wc->trans;
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+ return ret;
+ }
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exist anymore.
+ */
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
return ret;
}
@@ -845,87 +1072,93 @@ out:
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline int drop_one_dir_item(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
- struct inode *inode;
- char *name;
- int name_len;
- struct extent_buffer *leaf;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_inode *inode;
+ struct fscrypt_str name;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
- name_len = btrfs_dir_name_len(leaf, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name)
- return -ENOMEM;
+ ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
- name_len);
- if (ret)
- goto out;
- else
- ret = btrfs_run_delayed_items(trans);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
- kfree(name);
- iput(inode);
+ kfree(name.name);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, u64 objectid, u64 index,
- const char *name, int name_len)
+ struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
- index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ index, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
- di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -941,9 +1174,9 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- const char *name, int namelen)
+ const struct fscrypt_str *name)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -951,225 +1184,253 @@ static noinline int backref_in_log(struct btrfs_root *log,
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret == 1) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
- ref_objectid,
- name, namelen);
+ ref_objectid, name);
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
- path->slots[0],
- name, namelen);
-out:
- btrfs_free_path(path);
+ path->slots[0], name);
return ret;
}
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static int unlink_refs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
struct btrfs_inode *dir,
- struct btrfs_inode *inode,
- u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+ struct btrfs_inode *inode)
{
- int ret;
- char *victim_name;
- int victim_name_len;
- struct extent_buffer *leaf;
- struct btrfs_dir_item *di;
- struct btrfs_key search_key;
- struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
-again:
- /* Search old style refs */
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ /*
+ * Check all the names in this back reference to see if they are in the
+ * log. If so, we allow them to stay; otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
- */
- if (search_key.objectid == search_key.offset)
- return 1;
-
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- victim_ref = (struct btrfs_inode_ref *)ptr;
- victim_name_len = btrfs_inode_ref_name_len(leaf,
- victim_ref);
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
+ int ret;
- read_extent_buffer(leaf, victim_name,
- (unsigned long)(victim_ref + 1),
- victim_name_len);
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
- kfree(victim_name);
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
- } else if (!ret) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = btrfs_unlink_inode(trans, root, dir, inode,
- victim_name, victim_name_len);
- kfree(victim_name);
- if (ret)
- return ret;
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- return ret;
- *search_done = 1;
- goto again;
}
- kfree(victim_name);
-
- ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ kfree(victim_name.name);
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
}
- /*
- * NOTE: we have searched root tree and checked the
- * corresponding ref, it does not need to check again.
- */
- *search_done = 1;
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
- btrfs_release_path(path);
- /* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
- inode_objectid, parent_objectid, 0,
- 0);
- if (!IS_ERR_OR_NULL(extref)) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct inode *victim_parent;
+ return 0;
+}
- leaf = path->nodes[0];
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
+{
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ u32 cur_offset = 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
+ struct btrfs_inode_extref *extref;
+ struct fscrypt_str victim_name;
+ int ret;
- while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
- victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
+ goto next;
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
- read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
- victim_name_len);
-
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name,
- victim_name_len);
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ search_key->objectid = btrfs_ino(inode);
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
- } else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = btrfs_unlink_inode(trans, root,
- BTRFS_I(victim_parent),
- inode,
- victim_name,
- victim_name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(
- trans);
- }
- iput(victim_parent);
- kfree(victim_name);
- if (ret)
- return ret;
- *search_done = 1;
- goto again;
}
- kfree(victim_name);
+ kfree(victim_name.name);
next:
- cur_offset += victim_name_len + sizeof(*extref);
+ cur_offset += victim_name.len + sizeof(*extref);
+ continue;
}
- *search_done = 1;
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
- btrfs_release_path(path);
+
+ return 0;
+}
+
+static inline int __add_inode_ref(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 ref_index, struct fscrypt_str *name)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
+
+again:
+ /* Search old style refs */
+ search_key.objectid = btrfs_ino(inode);
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&search_key),
+ btrfs_root_id(root));
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
+ */
+ if (search_key.objectid == search_key.offset)
+ return 1;
+
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
- ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
+ ref_index, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
+ } else if (di) {
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
- name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
+ } else if (di) {
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
+ struct fscrypt_str *name, u64 *index,
u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
+ int ret;
extref = (struct btrfs_inode_extref *)ref_ptr;
- *namelen = btrfs_inode_extref_name_len(eb, extref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)&extref->name,
- *namelen);
+ ret = read_alloc_one_name(eb, &extref->name,
+ btrfs_inode_extref_name_len(eb, extref), name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_extref_index(eb, extref);
@@ -1180,18 +1441,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
}
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+ struct fscrypt_str *name, u64 *index)
{
struct btrfs_inode_ref *ref;
+ int ret;
ref = (struct btrfs_inode_ref *)ref_ptr;
- *namelen = btrfs_inode_ref_name_len(eb, ref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+ ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
+ name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_ref_index(eb, ref);
@@ -1206,230 +1466,133 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
-static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_inode *inode,
- struct extent_buffer *log_eb,
- int log_slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
+ struct btrfs_root *root = wc->root;
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
goto out;
+ }
- eb = path->nodes[0];
- ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
while (ref_ptr < ref_end) {
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name;
u64 parent_id;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
} else {
- parent_id = key->offset;
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- NULL);
+ parent_id = wc->log_key.offset;
+ ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
- parent_id, name,
- namelen);
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
+ parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
- name, namelen);
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
if (!ret) {
- struct inode *dir;
-
- btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
- kfree(name);
+ struct btrfs_inode *dir;
+
+ btrfs_release_path(wc->subvol_path);
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
- inode, name, namelen);
- kfree(name);
- iput(dir);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
+ kfree(name.name);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
}
- kfree(name);
- ref_ptr += namelen;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
+ kfree(name.name);
+ ref_ptr += name.len;
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
- btrfs_release_path(path);
- return ret;
-}
-
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
- const u8 ref_type, const char *name,
- const int namelen)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- const u64 parent_id = btrfs_ino(BTRFS_I(dir));
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = ref_type;
- if (key.type == BTRFS_INODE_REF_KEY)
- key.offset = parent_id;
- else
- key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
- ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id, name, namelen);
- else
- ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen);
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct inode *dir, struct inode *inode, const char *name,
- int namelen, u64 ref_index)
-{
- struct btrfs_dir_item *dir_item;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct inode *other_inode = NULL;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_item = btrfs_lookup_dir_item(NULL, root, path,
- btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
- if (!dir_item) {
- btrfs_release_path(path);
- goto add_link;
- } else if (IS_ERR(dir_item)) {
- ret = PTR_ERR(dir_item);
- goto out;
- }
-
- /*
- * Our inode's dentry collides with the dentry of another inode which is
- * in the log but not yet processed since it has a higher inode number.
- * So delete that other dentry.
- */
- btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
- btrfs_release_path(path);
- other_inode = read_one_inode(root, key.objectid);
- if (!other_inode) {
- ret = -ENOENT;
- goto out;
- }
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
- if (ret)
- goto out;
- /*
- * If we dropped the link count to 0, bump it so that later the iput()
- * on the inode will not free it. We will fixup the link count later.
- */
- if (other_inode->i_nlink == 0)
- inc_nlink(other_inode);
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
-add_link:
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
-out:
- iput(other_inode);
- btrfs_free_path(path);
-
+ btrfs_release_path(wc->subvol_path);
return ret;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Replay one inode back reference item found in the log tree.
+ * Path is for temporary use by this function (it should be released on return).
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int add_inode_ref(struct walk_control *wc)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name = { 0 };
int ret;
- int search_done = 0;
- int log_ref_ver = 0;
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1437,43 +1600,95 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ if (is_extref_item) {
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may have not been
+ * logged and not exist in the subvolume
+ * tree, see the comment above before
+ * the loop when getting the first
+ * parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
+ }
+ goto out;
+ }
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1481,62 +1696,42 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir),
- BTRFS_I(inode),
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out;
- }
- }
-
- /*
- * If a reference item already exists for this inode
- * with the same parent and name, but different index,
- * drop it and the corresponding directory index entries
- * from the parent before adding the new reference item
- * and dir index entries, otherwise we would fail with
- * -EEXIST returned from btrfs_add_link() below.
- */
- ret = btrfs_inode_ref_exists(inode, dir, key->type,
- name, namelen);
- if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
- BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen);
- /*
- * If we dropped the link count to 0, bump it so
- * that later the iput() on the inode will not
- * free it. We will fixup the link count later.
- */
- if (!ret && inode->i_nlink == 0)
- inc_nlink(inode);
- }
- if (ret < 0)
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
+ }
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
- ref_index);
- if (ret)
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
- ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
- kfree(name);
- name = NULL;
- if (log_ref_ver) {
- iput(dir);
+next:
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+ kfree(name.name);
+ name.name = NULL;
+ if (is_extref_item && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1549,35 +1744,23 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(wc, inode);
if (ret)
goto out;
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
- kfree(name);
- iput(dir);
- iput(inode);
- return ret;
-}
-
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 ino)
-{
- int ret;
-
- ret = btrfs_insert_orphan_item(trans, root, ino);
- if (ret == -EEXIST)
- ret = 0;
-
+ btrfs_release_path(wc->subvol_path);
+ kfree(name.name);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1591,13 +1774,13 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
@@ -1620,8 +1803,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1636,7 +1818,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1651,7 +1833,7 @@ process_slot:
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
@@ -1687,26 +1869,22 @@ process_slot:
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+static noinline int fixup_inode_link_count(struct walk_control *wc,
+ struct btrfs_inode *inode)
{
- struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = inode->root;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ const u64 ino = btrfs_ino(inode);
- ret = count_inode_refs(root, BTRFS_I(inode), path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1714,78 +1892,84 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ goto out;
}
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
- ret = insert_orphan_item(trans, root, ino);
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *inode;
+
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
- if (path->slots[0] == 0)
+ ret = 0;
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
- goto out;
+ break;
- btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ btrfs_release_path(wc->subvol_path);
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
+ }
- ret = fixup_inode_link_count(trans, root, inode);
- iput(inode);
+ ret = fixup_inode_link_count(wc, inode);
+ iput(&inode->vfs_inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -1795,38 +1979,50 @@ out:
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
-
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
+
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
} else {
- BUG(); /* Logic Error */
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1839,33 +2035,59 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 index,
- char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len, 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
+static int delete_conflicting_dir_entry(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_flags,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(wc, dir, dst_di);
+}
+
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@@ -1882,115 +2104,123 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
- char *name;
- int name_len;
- struct btrfs_dir_item *dst_di;
- struct btrfs_key found_key;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct fscrypt_str name = { 0 };
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
struct btrfs_key log_key;
- struct inode *dir;
- u8 log_type;
- int exists;
- int ret = 0;
- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ struct btrfs_key search_key;
+ struct btrfs_inode *dir;
+ u8 log_flags;
+ bool exists;
+ int ret;
+ bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
}
- log_type = btrfs_dir_type(eb, di);
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
-
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
- btrfs_release_path(path);
-
- if (key->type == BTRFS_DIR_ITEM_KEY) {
- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid,
- key->offset, name,
- name_len, 1);
- } else {
- /* Corruption */
- ret = -EINVAL;
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
- /* we need a sequence number to insert, so we only
- * do inserts for the BTRFS_DIR_INDEX_KEY types
- */
- if (key->type != BTRFS_DIR_INDEX_KEY)
+ exists = (ret == 0);
+ ret = 0;
+
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
- goto insert;
+ }
+ dir_dst_matches = (ret == 1);
}
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
- /* the existing item matches the logged item */
- if (found_key.objectid == log_key.objectid &&
- found_key.type == log_key.type &&
- found_key.offset == log_key.offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
- update_size = false;
+ btrfs_release_path(wc->subvol_path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ }
+ index_dst_matches = (ret == 1);
}
- /*
- * don't drop the conflicting directory entry if the inode
- * for the new entry doesn't exist
- */
- if (!exists)
- goto out;
+ btrfs_release_path(wc->subvol_path);
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
- if (ret)
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
goto out;
-
- if (key->type == BTRFS_DIR_INDEX_KEY)
- goto insert;
-out:
- btrfs_release_path(path);
- if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, dir);
}
- kfree(name);
- iput(dir);
- if (!ret && name_added)
- ret = 1;
- return ret;
-insert:
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_REF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = wc->log_key.objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1999,12 +2229,15 @@ insert:
goto out;
}
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_EXTREF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
- name_len);
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -2012,96 +2245,84 @@ insert:
update_size = false;
goto out;
}
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, key->objectid, key->offset,
- name, name_len, &log_key);
- if (ret && ret != -ENOENT && ret != -EEXIST)
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
+ &name, &log_key);
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
if (!ret)
name_added = true;
update_size = false;
ret = 0;
- goto out;
+
+out:
+ if (!ret && update_size) {
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ }
+ kfree(name.name);
+ iput(&dir->vfs_inode);
+ if (!ret && name_added)
+ ret = 1;
+ return ret;
}
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
- int ret = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ int ret;
struct btrfs_dir_item *di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- struct btrfs_path *fixup_path = NULL;
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret < 0)
- break;
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
+ "wc->log_key.type=%u", wc->log_key.type);
- /*
- * If this entry refers to a non-directory (directories can not
- * have a link count > 1) and it was added in the transaction
- * that was not committed, make sure we fixup the link count of
- * the inode it the entry points to. Otherwise something like
- * the following would result in a directory pointing to an
- * inode with a wrong link that does not account for this dir
- * entry:
- *
- * mkdir testdir
- * touch testdir/foo
- * touch testdir/bar
- * sync
- *
- * ln testdir/bar testdir/bar_link
- * ln testdir/foo testdir/foo_link
- * xfs_io -c "fsync" testdir/bar
- *
- * <power failure>
- *
- * mount fs, log replay happens
- *
- * File foo would remain with a link count of 1 when it has two
- * entries pointing to it in the directory testdir. This would
- * make it impossible to ever delete the parent directory has
- * it would result in stale dentries that can never be deleted.
- */
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_key di_key;
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
+ if (ret < 0)
+ return ret;
- if (!fixup_path) {
- fixup_path = btrfs_alloc_path();
- if (!fixup_path) {
- ret = -ENOMEM;
- break;
- }
- }
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+ * directory pointing to an inode with a wrong link that does not account
+ * for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+ * to ever delete the parent directory as it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
+ struct btrfs_key di_key;
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path,
- di_key.objectid);
- if (ret)
- break;
- }
- ret = 0;
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
- btrfs_free_path(fixup_path);
+
return ret;
}
@@ -2118,7 +2339,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
- u64 dirid, int key_type,
+ u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
@@ -2131,7 +2352,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
return 1;
key.objectid = dirid;
- key.type = key_type;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2145,7 +2366,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
@@ -2172,7 +2393,7 @@ next:
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
@@ -2192,162 +2413,157 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
- struct inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_inode *dir,
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
- u32 item_size;
struct btrfs_dir_item *di;
- struct btrfs_dir_item *log_di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- char *name;
- struct inode *inode;
+ struct fscrypt_str name = { 0 };
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
-again:
- eb = path->nodes[0];
- slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
- goto out;
- }
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
- log_di = NULL;
- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
- log_di = btrfs_lookup_dir_item(trans, log, log_path,
- dir_key->objectid,
- name, name_len, 0);
- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
- log_di = btrfs_lookup_dir_index_item(trans, log,
- log_path,
- dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
- }
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
- btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
- btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- kfree(name);
- return -EIO;
- }
+ /*
+ * Currently we only log dir index keys. Even if we replay a log created
+ * by an older kernel that logged both dir index and dir item keys, all
+ * we need to do is process the dir index keys, we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
- ret = link_to_fixup_dir(trans, root,
- path, location.objectid);
- if (ret) {
- kfree(name);
- iput(inode);
- goto out;
- }
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
+ goto out;
+ }
- inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
- BTRFS_I(inode), name, name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
- kfree(name);
- iput(inode);
- if (ret)
- goto out;
+ if (!force_remove) {
+ struct btrfs_dir_item *log_di;
- /* there might still be more names under this key
- * check and repeat if required
- */
- ret = btrfs_search_slot(NULL, root, dir_key, path,
- 0, 0);
- if (ret == 0)
- goto again;
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
+ dir_key->objectid,
+ dir_key->offset, &name, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
ret = 0;
goto out;
- } else if (IS_ERR(log_di)) {
- kfree(name);
- return PTR_ERR(log_di);
}
- btrfs_release_path(log_path);
- kfree(name);
+ }
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(wc->subvol_path);
+ btrfs_release_path(log_path);
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
+ goto out;
}
- ret = 0;
+
+ ret = link_to_fixup_dir(wc, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
+ kfree(name.name);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
-static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- const u64 ino)
+static int replay_xattr_deletes(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
struct btrfs_key search_key;
- struct btrfs_path *log_path;
- int i;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
int nritems;
int ret;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
goto out;
+ }
process_leaf:
- nritems = btrfs_header_nritems(path->nodes[0]);
- for (i = path->slots[0]; i < nritems; i++) {
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
- btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
- di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size_nr(path->nodes[0], i);
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
cur = 0;
while (cur < total_size) {
- u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
- u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
goto out;
}
- read_extent_buffer(path->nodes[0], name,
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
@@ -2355,40 +2571,59 @@ process_leaf:
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, ino,
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
name, name_len, -1);
- kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
- path, di);
- if (ret)
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(wc->subvol_path);
+ kfree(name);
search_key = key;
goto again;
}
- kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
+ kfree(name);
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(root, wc->subvol_path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_free_path(log_path);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -2403,101 +2638,113 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all)
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
- struct btrfs_path *log_path;
- struct inode *dir;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
- dir_key.type = BTRFS_DIR_ITEM_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
- btrfs_free_path(log_path);
- return 0;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
+ return ret;
}
-again:
+
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid, key_type,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
+ goto out;
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key " BTRFS_KEY_FMT,
+ btrfs_root_id(root),
+ BTRFS_KEY_FMT_VALUE(&dir_key));
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
- else if (ret < 0)
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
- found_key.type != dir_key.type)
- goto next_type;
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
-
-next_type:
ret = 0;
- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
- key_type = BTRFS_DIR_LOG_INDEX_KEY;
- dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(path);
- goto again;
- }
out:
- btrfs_release_path(path);
- btrfs_free_path(log_path);
- iput(dir);
+ btrfs_release_path(wc->subvol_path);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2512,48 +2759,70 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+static int replay_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int i;
+ struct btrfs_tree_parent_check check = {
+ .transid = gen,
+ .level = level
+ };
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_buffer(eb, gen, level, NULL);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*
+ * Set to NULL since it was not yet read and in case we abort log replay
+ * on error, we have no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2561,19 +2830,23 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
@@ -2586,30 +2859,49 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
- struct inode *inode;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
- ret = btrfs_drop_extents(wc->trans, root, inode,
- from, (u64)-1, 1);
- if (!ret) {
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
+ inode_sub_bytes(&inode->vfs_inode,
+ drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- root, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
@@ -2617,10 +2909,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (wc->ignore_cur_inode)
continue;
- if (key.type == BTRFS_DIR_INDEX_KEY &&
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ ret = replay_one_dir_item(wc);
if (ret)
break;
}
@@ -2629,72 +2920,84 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
- break;
- ret = 0;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
if (ret)
break;
- } else if (key.type == BTRFS_DIR_ITEM_KEY) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
-/*
- * Correctly adjust the reserved bytes occupied by a log tree extent buffer
- */
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int clean_log_buffer(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
{
- struct btrfs_block_group *cache;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
- cache = btrfs_lookup_block_group(fs_info, start);
- if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
+ btrfs_tree_lock(eb);
+ btrfs_clear_buffer_dirty(trans, eb);
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_unlock(eb);
+
+ if (trans) {
+ int ret;
+
+ ret = btrfs_pin_reserved_extent(trans, eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
}
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->reserved -= fs_info->nodesize;
- cache->space_info->bytes_reserved -= fs_info->nodesize;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
- btrfs_put_block_group(cache);
+ btrfs_put_block_group(bg);
+
+ return 0;
}
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- u32 blocksize;
int ret = 0;
while (*level > 0) {
- struct btrfs_key first_key;
+ struct btrfs_tree_parent_check check = { 0 };
cur = path->nodes[*level];
@@ -2706,16 +3009,25 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
- blocksize = fs_info->nodesize;
-
- next = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(next))
- return PTR_ERR(next);
+ check.transid = ptr_gen;
+ check.level = *level - 1;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
+
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen,
- *level - 1);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2723,37 +3035,32 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen,
- *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
- if (trans) {
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
- ret = btrfs_pin_reserved_extent(trans,
- bytenr, blocksize);
- if (ret) {
- free_extent_buffer(next);
- return ret;
- }
- } else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
- unaccount_log_buffer(fs_info, bytenr);
+ ret = clean_log_buffer(trans, next);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
}
}
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2770,12 +3077,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int i;
int slot;
int ret;
@@ -2788,35 +3092,16 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- ret = wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[*level];
-
- if (trans) {
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
- ret = btrfs_pin_reserved_extent(trans,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
- if (ret)
- return ret;
- } else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
-
- unaccount_log_buffer(fs_info,
- path->nodes[*level]->start);
- }
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
+ if (ret)
+ return ret;
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2831,14 +3116,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = log->fs_info;
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2848,59 +3132,34 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- atomic_inc(&log->node->refs);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
- goto out;
- if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[orig_level];
-
- if (trans) {
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
- ret = btrfs_pin_reserved_extent(trans,
- next->start, next->len);
- if (ret)
- goto out;
- } else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
- unaccount_log_buffer(fs_info, next->start);
- }
- }
+ return ret;
+ if (wc->free)
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2968,12 +3227,55 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
-static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
- struct btrfs_log_ctx *ctx)
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
{
- if (!ctx)
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+ ctx->scratch_eb = NULL;
+}
+
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = ctx->inode;
+
+ if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
return;
+ /*
+ * Don't care about allocation failure. This is just for optimization,
+ * if we fail to allocate here, we will try again later if needed.
+ */
+ ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
+}
+
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ btrfs_assert_inode_locked(ctx->inode);
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -2993,15 +3295,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
list_del_init(&ctx->list);
ctx->log_ret = error;
}
-
- INIT_LIST_HEAD(&root->log_ctxs[index]);
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -3023,6 +3322,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
@@ -3037,7 +3338,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
- ASSERT(log_transid == root->log_transid);
+ ASSERT(log_transid == root->log_transid,
+ "log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
@@ -3060,24 +3362,34 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole when writing out the extents,
+ * and we cannot proceed on a zoned filesystem, which requires
+ * sequential writing. We could bail out to a full commit here,
+ * but instead we continue, hoping the concurrent write fills
+ * the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
if (ret) {
blk_finish_plug(&plug);
- btrfs_abort_transaction(trans, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
@@ -3099,7 +3411,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
@@ -3109,47 +3421,43 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
+
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
- if (!list_empty(&root_log_ctx.list))
- list_del_init(&root_log_ctx.list);
-
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
-
- if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, ret);
- mutex_unlock(&log_root_tree->log_mutex);
- goto out;
- }
+ if (ret != -ENOSPC)
+ btrfs_err(fs_info,
+ "failed to update log for root %llu ret %d",
+ btrfs_root_id(root), ret);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
goto out;
}
@@ -3161,7 +3469,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3172,7 +3479,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = root_log_ctx.log_ret;
goto out;
}
- ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
+ "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
+ root_log_ctx.log_transid, log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
@@ -3180,8 +3489,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3190,56 +3497,98 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
- if (ret) {
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for these write outs since waiting would cause a
+ * deadlock. Bail out to the full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
btrfs_set_log_full_commit(trans);
- btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
- EXTENT_NEW | EXTENT_DIRTY);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_set_super_log_root(fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
-
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
/*
- * Nobody else is going to jump in and write the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
+ * Here we are guaranteed that nobody is going to write the superblock
+ * for the current transaction before us, and that we do not write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+ * 1) We are holding a handle on the current transaction, so nobody
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+ * written its superblock, we wait for it to do it, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
*/
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out_wake_log_root;
+ }
+
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
- if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ if (unlikely(ret)) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
- mutex_lock(&root->log_mutex);
- if (root->last_log_commit < log_transid)
- root->last_log_commit = log_transid;
- mutex_unlock(&root->log_mutex);
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
+ "last_log_commit(root)=%d log_transid=%d",
+ btrfs_get_root_last_log_commit(root), log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3276,21 +3625,48 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
{
int ret;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ if (log->node) {
+ ret = walk_log_tree(&wc);
+ if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
+
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
+
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
- clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- extent_io_tree_release(&log->log_csum_range);
+ btrfs_extent_io_tree_release(&log->dirty_log_pages);
+ btrfs_extent_io_tree_release(&log->log_csum_range);
+
btrfs_put_root(log);
}
@@ -3303,6 +3679,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -3313,32 +3690,190 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
if (fs_info->log_root_tree) {
free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up not logging new names and removing old
+ * names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
+/*
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
+ */
+static int inode_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
+{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
+ return 1;
+
+ /*
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+ * torn into two 32-bit updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
+ return 0;
+ }
+ spin_unlock(&inode->lock);
+
+ /*
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
+
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, especially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may not have been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less than the
+ * current transaction to avoid doing the search in future calls.
+ */
+ return mark_inode_as_not_logged(trans, inode);
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transaction's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ spin_unlock(&inode->lock);
+
+ return 1;
+}
+
/*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exist
+ * 0 if the entry existed and was successfully deleted
*/
-static bool inode_logged(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const struct fscrypt_str *name,
+ u64 index)
{
- if (inode->logged_trans == trans->transid)
- return true;
+ struct btrfs_dir_item *di;
- if (inode->last_trans == trans->transid &&
- test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
- !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
- return true;
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
- return false;
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_del_item(trans, log, path);
}
/*
@@ -3362,143 +3897,72 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ struct btrfs_root *root = dir->root;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
- int err = 0;
- int bytes_del = 0;
- u64 dir_ino = btrfs_ino(dir);
-
- if (!inode_logged(trans, dir))
- return 0;
-
- ret = join_running_log_trans(root);
- if (ret)
- return 0;
- mutex_lock(&dir->log_mutex);
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
- name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
- }
- btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
+ btrfs_set_log_full_commit(trans);
+ return;
}
- /* update the directory size in the log to reflect the names
- * we have removed
- */
- if (bytes_del) {
- struct btrfs_key key;
-
- key.objectid = dir_ino;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(path);
+ ret = join_running_log_trans(root);
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ return;
- ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 0) {
- struct btrfs_inode_item *item;
- u64 i_size;
+ mutex_lock(&dir->log_mutex);
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- i_size = btrfs_inode_size(path->nodes[0], item);
- if (i_size > bytes_del)
- i_size -= bytes_del;
- else
- i_size = 0;
- btrfs_set_inode_size(path->nodes[0], item, i_size);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else
- ret = 0;
- btrfs_release_path(path);
- }
-fail:
- btrfs_free_path(path);
-out_unlock:
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, index);
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir)
{
- struct btrfs_root *log;
- u64 index;
+ struct btrfs_root *root = dir->root;
int ret;
- if (!inode_logged(trans, inode))
- return 0;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
- if (ret)
- return 0;
- log = root->log_root;
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ return;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
- dirid, &index);
+ ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode),
+ btrfs_ino(dir), NULL);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
@@ -3509,7 +3973,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- int key_type, u64 dirid,
+ u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
@@ -3517,50 +3981,282 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = first_offset;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- key.type = BTRFS_DIR_LOG_ITEM_KEY;
- else
- key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ char AUTO_KFREE(ins_data);
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ u64 last_index;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0, "count=%d", count);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ return ret;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second item starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+
+ last_index = batch.keys[count - 1].offset;
+ ASSERT(last_index > inode->last_dir_index_offset,
+ "last_index=%llu inode->last_dir_index_offset=%llu",
+ last_index, inode->last_dir_index_offset);
+
+ /*
+ * If for some unexpected reason the last item's index is not greater
+ * than the last index we logged, warn and force a transaction commit.
+ */
+ if (WARN_ON(last_index <= inode->last_dir_index_offset))
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ else
+ inode->last_dir_index_offset = last_index;
+
+ if (btrfs_get_first_dir_index_to_log(inode) == 0)
+ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
+
+ return ret;
+}
+
+static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
+{
+ const int slot = path->slots[0];
+
+ if (ctx->scratch_eb) {
+ copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
+ } else {
+ ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!ctx->scratch_eb)
+ return -ENOMEM;
+ }
+
+ btrfs_release_path(path);
+ path->nodes[0] = ctx->scratch_eb;
+ path->slots[0] = slot;
+ /*
+ * Add extra ref to scratch eb so that it is not freed when callers
+ * release the path, so we can reuse it later if needed.
+ */
+ refcount_inc(&ctx->scratch_eb->refs);
+
+ return 0;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
+ const u64 ino = btrfs_ino(inode);
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int ret;
+
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ ret = clone_leaf(path, ctx);
+ if (ret < 0)
+ return ret;
+
+ src = path->nodes[0];
+
+ for (int i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * always resulted in stale file handle errors, and it would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, inode, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path, int key_type,
+ struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
- int err = 0;
int ret;
- int i;
- int nritems;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = min_offset;
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
@@ -3569,9 +4265,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -3579,7 +4276,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -3588,42 +4285,65 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
- if (key_type == tmp.type)
- first_offset = max(min_offset, tmp.offset) + 1;
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret > 0) {
+ ret = 0;
}
+
goto done;
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (key_type == tmp.type) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ goto done;
}
+
btrfs_release_path(path);
/*
- * Find the first key from this transaction again. See the note for
- * log_new_dir_dentries, if we're logging a directory recursively we
- * won't be holding its i_mutex, which means we can modify the directory
- * while we're logging it. If we remove an entry between our first
- * search and this search we'll not find the key again and can just
- * bail.
+ * Find the first key from this transaction again or the one we were at
+ * in the loop below in case we had to reschedule. We may be logging the
+ * directory without holding its VFS lock, which happens when logging new
+ * dentries (through log_new_dir_dentries()) or in some cases when we
+ * need to log the parent directory of an inode. This means a dir index
+ * key might be deleted from the inode's root, and therefore we may not
+ * find it anymore. If we can't find it, just move to the next key. We
+ * can not bail out and ignore, because if we do that we will simply
+ * not log dir index keys that come after the one that was just deleted
+ * and we can end up logging a dir index range that ends at (u64)-1
+ * (@last_offset is initialized to that), resulting in removing dir
+ * entries we should not remove at log replay time.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (ret != 0)
+ if (ret > 0) {
+ ret = btrfs_next_item(root, path);
+ if (ret > 0) {
+ /* There are no more keys in the inode's root. */
+ ret = 0;
+ goto done;
+ }
+ }
+ if (ret < 0)
goto done;
/*
@@ -3631,55 +4351,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
- err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3687,44 +4366,125 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
ret = btrfs_next_leaf(root, path);
if (ret) {
- if (ret == 1)
+ if (ret == 1) {
last_offset = (u64)-1;
- else
- err = ret;
+ ret = 0;
+ }
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret)
- err = ret;
- else
- last_offset = tmp.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
btrfs_release_path(dst_path);
- if (err == 0) {
+ if (ret == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is equal to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, key_type,
- ino, first_offset, last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset,
+ "last_old_dentry_offset=%llu last_offset=%llu",
+ last_old_dentry_offset, last_offset);
+ if (last_old_dentry_offset < last_offset)
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
}
- return err;
+
+ return ret;
+}
+
+/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is 0, so we don't know the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != 0)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
}
/*
@@ -3740,7 +4500,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3748,13 +4508,16 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 min_key;
u64 max_key;
int ret;
- int key_type = BTRFS_DIR_ITEM_KEY;
-again:
- min_key = 0;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3763,10 +4526,6 @@ again:
min_key = max_key + 1;
}
- if (key_type == BTRFS_DIR_ITEM_KEY) {
- key_type = BTRFS_DIR_INDEX_KEY;
- goto again;
- }
return 0;
}
@@ -3776,39 +4535,40 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
- BUG_ON(ret == 0); /* Logic error */
- if (ret < 0)
- break;
-
- if (path->slots[0] == 0)
+ if (ret < 0) {
break;
+ } else if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
- path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
found_key.type = 0;
- ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
+ ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
if (ret < 0)
break;
@@ -3828,15 +4588,28 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
+
+ return btrfs_truncate_inode_items(trans, log_root, &control);
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only,
+ struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token, leaf);
+ u64 flags;
if (log_inode_only) {
/* set the generation to zero so the recover code
@@ -3844,78 +4617,121 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(&token, item, 0);
- btrfs_set_token_inode_size(&token, item, logged_isize);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_size(&token, item, inode->i_size);
- }
-
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
-
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
-
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
-
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode->i_ctime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode->i_ctime.tv_nsec);
-
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
-
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
+ }
+
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ /*
+ * We do not need to set the nbytes field, in fact during a fast fsync
+ * its value may not even be correct, since a fast fsync does not wait
+ * for ordered extent completion, which is where we update nbytes, it
+ * only waits for writeback to complete. During log replay as we find
+ * file extent items and replay them, we adjust the nbytes field of the
+ * inode item in subvolume tree as needed (see overwrite_item()).
+ */
+
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
+ struct btrfs_key key;
int ret;
- ret = btrfs_insert_empty_item(trans, log, path,
- &inode->location, sizeof(*inode_item));
- if (ret && ret != -EEXIST)
+ btrfs_get_inode_key(inode, &key);
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+ * We can never get -EEXIST because we are only called for a fast
+ * fsync and in case an inode eviction happens after the inode was
+ * logged before in the current transaction, when we load again
+ * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
+ * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
- 0, 0);
+ false, 0);
btrfs_release_path(path);
return 0;
}
static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
- const u64 lock_end = sums->bytenr + sums->len - 1;
+ const u64 lock_end = sums->logical + sums->len - 1;
struct extent_state *cached_state = NULL;
int ret;
/*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
* Serialize logging for checksums. This is to avoid racing with the
* same checksum being logged by another task that is logging another
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
- lock_end, &cached_state);
+ ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -3927,12 +4743,12 @@ static int log_csums(struct btrfs_trans_handle *trans,
* some checksums missing in the fs/subvolume tree. So just delete (or
* trim and adjust) any existing csum items in the log for this range.
*/
- ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
- &cached_state);
+ btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
return ret;
}
@@ -3942,53 +4758,210 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
- u64 logged_isize)
+ u64 logged_isize, struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- unsigned long src_offset;
- unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
- struct extent_buffer *src = src_path->nodes[0];
+ struct extent_buffer *src;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- char *ins_data;
- int i;
- struct list_head ordered_sums;
- int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
+ struct btrfs_item_batch batch;
+ char AUTO_KFREE(ins_data);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
- INIT_LIST_HEAD(&ordered_sums);
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ ret = clone_leaf(src_path, ctx);
+ if (ret < 0)
+ return ret;
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ src = src_path->nodes[0];
+
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = 0;
+
+ dst_index = 0;
+ for (int i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, false);
+ if (ret < 0)
+ return ret;
+ ret = 0;
- for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ return ret;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
}
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
- if (ret) {
- kfree(ins_data);
+
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ return 0;
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
return ret;
- }
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ dst_index = 0;
+ for (int i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
@@ -3996,80 +4969,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ src_offset, ins_sizes[dst_index]);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
-
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
-
- ret = btrfs_lookup_csums_range(
- fs_info->csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
- }
- }
+ dst_index++;
}
- btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
- kfree(ins_data);
-
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- ret = 0;
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = log_csums(trans, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- struct extent_map *em1, *em2;
+ const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
@@ -4084,41 +4998,107 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *csum_root;
+ u64 block_start;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->start;
+ u64 mod_len = em->len;
LIST_HEAD(ordered_sums);
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- em->block_start == EXTENT_MAP_HOLE)
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
+ em->disk_bytenr == EXTENT_MAP_HOLE)
+ return 0;
+
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (btrfs_extent_map_is_compressed(em)) {
csum_offset = 0;
- csum_len = max(em->block_len, em->orig_block_len);
+ csum_len = em->disk_num_bytes;
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
- em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
- if (ret)
+ block_start = btrfs_extent_map_block_start(em);
+ csum_root = btrfs_csum_root(trans->fs_info, block_start);
+ ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
+ block_start + csum_offset + csum_len - 1,
+ &ordered_sums, false);
+ if (ret < 0)
return ret;
+ ret = 0;
while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
if (!ret)
- ret = log_csums(trans, log_root, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4127,76 +5107,81 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
- struct btrfs_map_token token;
struct btrfs_key key;
- u64 extent_offset = em->start - em->orig_start;
+ enum btrfs_compression_type compress_type;
+ u64 extent_offset = em->offset;
+ u64 block_start = btrfs_extent_map_block_start(em);
u64 block_len;
int ret;
- int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
- if (ret)
- return ret;
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (em->flags & EXTENT_FLAG_PREALLOC)
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = em->disk_num_bytes;
+ compress_type = btrfs_extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- if (!extent_inserted) {
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (ctx->logged_before) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
+
+ if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
+ sizeof(fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
- btrfs_init_map_token(&token, leaf);
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_PREALLOC);
- else
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_REG);
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start -
- extent_offset);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
- }
-
- btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
- btrfs_set_token_file_extent_encryption(&token, fi, 0);
- btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
- btrfs_mark_buffer_dirty(leaf);
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
btrfs_release_path(path);
@@ -4205,7 +5190,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4213,19 +5198,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *dst_path = NULL;
+ BTRFS_PATH_AUTO_FREE(dst_path);
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
@@ -4279,7 +5265,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4303,22 +5289,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (!dropped_extents) {
- /*
- * Avoid logging extent items logged in past fsync calls
- * and leading to duplicate keys in the log tree.
- */
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- &inode->vfs_inode,
- truncate_offset,
- BTRFS_EXTENT_DATA_KEY);
- } while (ret == -EAGAIN);
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ min(key.offset, truncate_offset),
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
+ truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
@@ -4333,51 +5320,28 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
- btrfs_free_path(dst_path);
return ret;
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
- struct list_head extents;
+ LIST_HEAD(extents);
struct extent_map_tree *tree = &inode->extent_tree;
- u64 test_gen;
int ret = 0;
int num = 0;
- INIT_LIST_HEAD(&extents);
-
write_lock(&tree->lock);
- test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4391,17 +5355,17 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
goto process;
}
- if (em->generation <= test_gen)
+ if (em->generation < trans->transid)
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
@@ -4409,7 +5373,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_sort(NULL, &extents, extent_cmp);
process:
while (!list_empty(&extents)) {
- em = list_entry(extents.next, struct extent_map, list);
+ em = list_first_entry(&extents, struct extent_map, list);
list_del_init(&em->list);
@@ -4418,26 +5382,49 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(tree, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
continue;
}
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(tree, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- btrfs_release_path(path);
if (!ret)
- ret = btrfs_log_prealloc_extents(trans, inode, path);
+ ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extent completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock(&inode->ordered_tree_lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock(&inode->ordered_tree_lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4490,16 +5477,21 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -4517,7 +5509,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -4538,15 +5530,19 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
start_slot = slot;
ins_nr++;
path->slots[0]++;
+ found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
}
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
return 0;
}
@@ -4560,10 +5556,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4610,10 +5606,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
* leafs from the log root.
*/
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0,
- 0, hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
if (ret < 0)
return ret;
@@ -4642,10 +5637,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0, 0,
- hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
if (ret < 0)
return ret;
}
@@ -4701,19 +5694,18 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_inode *inode,
u64 *other_ino, u64 *other_parent)
{
- int ret;
- struct btrfs_path *search_path;
- char *name = NULL;
+ BTRFS_PATH_AUTO_FREE(search_path);
+ char AUTO_KFREE(name);
u32 name_len = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
search_path = btrfs_alloc_path();
if (!search_path)
return -ENOMEM;
- search_path->search_commit_root = 1;
- search_path->skip_locking = 1;
+ search_path->search_commit_root = true;
+ search_path->skip_locking = true;
while (cur_offset < item_size) {
u64 parent;
@@ -4721,6 +5713,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
u32 this_len;
unsigned long name_ptr;
struct btrfs_dir_item *di;
+ struct fscrypt_str name_str;
if (key->type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -4745,17 +5738,18 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
char *new_name;
new_name = krealloc(name, this_name_len, GFP_NOFS);
- if (!new_name) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_name)
+ return -ENOMEM;
name_len = this_name_len;
name = new_name;
}
read_extent_buffer(eb, name, name_ptr, this_name_len);
+
+ name_str.name = name;
+ name_str.len = this_name_len;
di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
- parent, name, this_name_len, 0);
+ parent, &name_str, 0);
if (di && !IS_ERR(di)) {
struct btrfs_key di_key;
@@ -4763,28 +5757,243 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
if (di_key.objectid != key->objectid) {
- ret = 1;
*other_ino = di_key.objectid;
*other_parent = parent;
+ return 1;
} else {
- ret = 0;
+ return 0;
}
} else {
- ret = -EAGAIN;
+ return -EAGAIN;
}
- goto out;
} else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
+ return PTR_ERR(di);
}
btrfs_release_path(search_path);
cur_offset += this_len;
}
- ret = 0;
+
+ return 0;
+}
+
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode_logged(trans, inode, NULL) == 1 &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ struct btrfs_inode *curr_inode = start_inode;
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Pairs with btrfs_add_delayed_iput below. */
+ ihold(&curr_inode->vfs_inode);
+
+ while (true) {
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 next_index;
+ bool continue_curr_inode = true;
+ int iter_ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+ next_index = key.offset;
+again:
+ btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct btrfs_inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ if (found_key.objectid != ino ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ next_index = found_key.offset + 1;
+
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ type = btrfs_dir_ftype(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ btrfs_release_path(path);
+
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
+ } else if (iter_ret > 0) {
+ continue_curr_inode = false;
+ } else {
+ key = found_key;
+ }
+
+ if (continue_curr_inode && key.offset < (u64)-1) {
+ key.offset++;
+ goto again;
+ }
+
+ btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+
+ btrfs_add_delayed_iput(curr_inode);
+
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
+ break;
+ }
+ }
out:
- btrfs_free_path(search_path);
- kfree(name);
+ btrfs_free_path(path);
+ if (curr_inode)
+ btrfs_add_delayed_iput(curr_inode);
+
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
return ret;
}
@@ -4794,105 +6003,251 @@ struct btrfs_ino_list {
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = true;
+ path->skip_locking = true;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = false;
+ path->skip_locking = false;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct btrfs_inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget_logging(ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fall back to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
-
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+ return 0;
+}
- btrfs_release_path(path);
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret = 0;
+
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct btrfs_inode *inode;
+ u64 ino;
+ u64 parent;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
+
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- inode = btrfs_iget(fs_info->sb, parent, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans, root,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget_logging(parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
- *
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
- *
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * We did this check at add_conflicting_inode(), but we repeat
+ * it here because some other task may have logged the inode
+ * since then, in which case we can skip logging it again.
*/
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
+ if (!need_log_inode(trans, inode)) {
btrfs_add_delayed_iput(inode);
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -4900,68 +6255,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
- }
-
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
- if (ret < 0)
- break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
- }
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
}
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
return ret;
}
@@ -4972,11 +6275,11 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -4997,13 +6300,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5012,8 +6323,8 @@ again:
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ } else if (ret > 0 &&
+ other_ino != btrfs_ino(ctx->inode)) {
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5022,27 +6333,26 @@ again:
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
- inode_only, logged_isize);
+ inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
- ret = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
if (ret)
return ret;
- btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5059,7 +6369,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 1;
@@ -5074,7 +6384,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
- logged_isize);
+ logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5089,10 +6399,396 @@ next_key:
} else {
break;
}
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
+ }
+
+ return ret;
+}
+
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start = 0;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * The log mutex is not needed here, and not holding it avoids potential
+ * deadlocks or lockdep annotations due to nesting of delayed inode
+ * mutexes and log mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries,
+ "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
return ret;
}
@@ -5112,19 +6808,15 @@ next_key:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
- int err = 0;
+ struct btrfs_root *log = inode->root->log_root;
int ret;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
@@ -5132,7 +6824,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
+ bool inode_item_dropped = true;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -5160,33 +6855,80 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode(),
+ * which could otherwise happen with something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so let's prevent deep levels of
+ * recursion from happening by limiting the maximum depth to 1.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (ret)
+ goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
+ mutex_lock(&inode->log_mutex);
+
+ /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because we create the
+ * inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
+ * This is for cases where logging a directory could result in losing a
+ * file after replaying the log. For example, if we move a file from a
+ * directory A to a directory B, then fsync directory A, we have no way
+ * to know the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_unlock;
}
/*
@@ -5194,13 +6936,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ BTRFS_XATTR_ITEM_KEY);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5214,27 +6955,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- err = logged_inode_size(log, inode, path, &logged_isize);
- if (err)
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, &inode->vfs_inode, 0, 0);
- if (ret != -EAGAIN)
- break;
- }
+ if (ctx->logged_before)
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5242,393 +6981,176 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
+ inode_item_dropped = false;
goto log_extents;
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
+
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
+ /*
+ * If we are fsyncing a file with 0 hard links, then commit the delayed
+ * inode because the last inode ref (or extref) item may still be in the
+ * subvolume tree and if we log it the file will still exist after a log
+ * replay. So commit the delayed inode to delete that last ref and we
+ * skip logging it.
+ */
+ if (inode->vfs_inode.i_nlink == 0) {
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret)
+ goto out_unlock;
}
- err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only, ctx,
+ inode_only, ctx,
&need_log_inode_item);
- if (err)
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
+ if (ret)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
- if (err)
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode);
- if (!err && !xattrs_logged) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
+ goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
btrfs_release_path(path);
}
- if (err)
- goto out_unlock;
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
- if (ret) {
- err = ret;
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
+ if (ret)
goto out_unlock;
- }
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
- if (ret) {
- err = ret;
+ if (full_dir_logging) {
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
goto out_unlock;
- }
}
- /*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
- */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ /*
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
+ */
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
-out_unlock:
- mutex_unlock(&inode->log_mutex);
-
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return err;
-}
-
-/*
- * Check if we must fallback to a transaction commit when logging an inode.
- * This must be called after logging the inode and is used only in the context
- * when fsyncing an inode requires the need to log some other inode - in which
- * case we can't lock the i_mutex of each other inode we need to log as that
- * can lead to deadlocks with concurrent fsync against other inodes (as we can
- * log inodes up or down in the hierarchy) or rename operations for example. So
- * we take the log_mutex of the inode after we have logged it and then check for
- * its last_unlink_trans value - this is safe because any task setting
- * last_unlink_trans must take the log_mutex and it must do this before it does
- * the actual unlink operation, so if we do this check before a concurrent task
- * sets last_unlink_trans it means we've logged a consistent version/state of
- * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task might have only updated last_unlink_trans before
- * we logged the inode or it might have also done the unlink).
- */
-static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool ret = false;
-
- mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans > fs_info->last_trans_committed) {
- /*
- * Make sure any commits to the log are forced to be full
- * commits.
- */
- btrfs_set_log_full_commit(trans);
- ret = true;
- }
- mutex_unlock(&inode->log_mutex);
-
- return ret;
-}
-
-/*
- * follow the dentry parent pointers up the chain and see if any
- * of the directories in it require a full commit before they can
- * be logged. Returns zero if nothing special needs to be done or 1 if
- * a full commit is required.
- */
-static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode,
- struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
-{
- int ret = 0;
- struct dentry *old_parent = NULL;
/*
- * for regular files, if its inode is already on disk, we don't
- * have to worry about the parents at all. This is because
- * we can use the last_unlink_trans field to record renames
- * and other fun in this file.
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
*/
- if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed)
- goto out;
-
- if (!S_ISDIR(inode->vfs_inode.i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- goto out;
- inode = BTRFS_I(d_inode(parent));
- }
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
- while (1) {
- if (btrfs_must_commit_transaction(trans, inode)) {
- ret = 1;
- break;
- }
-
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
-
- if (IS_ROOT(parent)) {
- inode = BTRFS_I(d_inode(parent));
- if (btrfs_must_commit_transaction(trans, inode))
- ret = 1;
- break;
- }
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- inode = BTRFS_I(d_inode(parent));
-
- }
- dput(old_parent);
+out_unlock:
+ mutex_unlock(&inode->log_mutex);
out:
- return ret;
-}
-
-struct btrfs_dir_list {
- u64 ino;
- struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- * CPU0 CPU1
- * ---- ----
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(sb_internal#2);
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- * that while logging the inode new references (names) are added or removed
- * from the inode, leaving the logged inode item with a link count that does
- * not match the number of logged inode reference items. This is fine because
- * at log replay time we compute the real number of links and correct the
- * link count in the inode item (see replay_one_buffer() and
- * link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
- * has a size that doesn't match the sum of the lengths of all the logged
- * names. This does not result in a problem because if a dir_item key is
- * logged but its matching dir_index key is not logged, at log replay time we
- * don't use it to replay the respective name (see replay_one_name()). On the
- * other hand if only the dir_index key ends up being logged, the respective
- * name is added to the fs/subvol tree with both the dir_item and dir_index
- * keys created (see replay_one_name()).
- * The directory's inode item with a wrong i_size is not a problem as well,
- * since we don't use it at log replay time to set the i_size in the inode
- * item of the fs/subvol tree (see overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *start_inode,
- struct btrfs_log_ctx *ctx)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *log = root->log_root;
- struct btrfs_path *path;
- LIST_HEAD(dir_list);
- struct btrfs_dir_list *dir_elem;
- int ret = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
- if (!dir_elem) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
- dir_elem->ino = btrfs_ino(start_inode);
- list_add_tail(&dir_elem->list, &dir_list);
-
- while (!list_empty(&dir_list)) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
- int nritems;
- int i;
-
- dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
- list);
- if (ret)
- goto next_dir_inode;
-
- min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_ITEM_KEY;
- min_key.offset = 0;
-again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(log, &min_key, path, trans->transid);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
-
-process_leaf:
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
- struct btrfs_key di_key;
- struct inode *di_inode;
- struct btrfs_dir_list *new_dir_elem;
- int log_mode = LOG_INODE_EXISTS;
- int type;
-
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_ITEM_KEY)
- goto next_dir_inode;
-
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
- continue;
- btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
- if (di_key.type == BTRFS_ROOT_ITEM_KEY)
- continue;
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
- btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
- if (IS_ERR(di_inode)) {
- ret = PTR_ERR(di_inode);
- goto next_dir_inode;
- }
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
- if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- btrfs_add_delayed_iput(di_inode);
- break;
- }
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
- ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
- log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
- ret = 1;
- btrfs_add_delayed_iput(di_inode);
- if (ret)
- goto next_dir_inode;
- if (ctx->log_new_dentries) {
- new_dir_elem = kmalloc(sizeof(*new_dir_elem),
- GFP_NOFS);
- if (!new_dir_elem) {
- ret = -ENOMEM;
- goto next_dir_inode;
- }
- new_dir_elem->ino = di_key.objectid;
- list_add_tail(&new_dir_elem->list, &dir_list);
- }
- break;
- }
- if (i == nritems) {
- ret = btrfs_next_leaf(log, path);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
- goto process_leaf;
- }
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- goto again;
- }
-next_dir_inode:
- list_del(&dir_elem->list);
- kfree(dir_elem);
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
}
- btrfs_free_path(path);
return ret;
}
@@ -5636,9 +7158,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
@@ -5646,15 +7167,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -5666,8 +7187,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -5677,32 +7198,27 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
- struct btrfs_key inode_key;
- struct inode *dir_inode;
-
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
+ u64 dir_id;
+ struct btrfs_inode *dir_inode;
if (key.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)
(ptr + cur_offset);
- inode_key.objectid = btrfs_inode_extref_parent(
- leaf, extref);
+ dir_id = btrfs_inode_extref_parent(leaf, extref);
cur_offset += sizeof(*extref);
cur_offset += btrfs_inode_extref_name_len(leaf,
extref);
} else {
- inode_key.objectid = key.offset;
+ dir_id = key.offset;
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
- root);
+ dir_inode = btrfs_iget_logging(dir_id, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -5726,31 +7242,25 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode)) {
- ret = PTR_ERR(dir_inode);
- goto out;
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
+
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
- ret = 1;
- if (!ret && ctx && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, root,
- BTRFS_I(dir_inode), ctx);
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
- goto out;
+ return ret;
}
path->slots[0]++;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -5763,12 +7273,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- const u64 last_committed = fs_info->last_trans_committed;
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ struct extent_buffer *leaf;
+ int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -5779,14 +7287,13 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation > last_committed)
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5825,7 +7332,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct dentry *old_parent = NULL;
struct super_block *sb = inode->vfs_inode.i_sb;
int ret = 0;
@@ -5839,9 +7345,10 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (root != inode->root)
break;
- if (inode->generation > fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
+ ret = btrfs_log_inode(trans, inode,
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5864,7 +7371,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret;
@@ -5885,7 +7392,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0)
path->slots[0]++;
@@ -5897,8 +7404,8 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -5915,10 +7422,8 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = -EMLINK;
- goto out;
- }
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -5930,14 +7435,11 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
goto again;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -5949,61 +7451,37 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct super_block *sb;
int ret = 0;
- u64 last_committed = fs_info->last_trans_committed;
- bool log_dentries = false;
+ bool log_dentries;
- sb = inode->vfs_inode.i_sb;
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = 1;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
- * The prev transaction commit doesn't complete, we need do
- * full commit by ourselves.
+ * If we're logging an inode from a subvolume created in the current
+ * transaction we must force a commit since the root is not persisted.
*/
- if (fs_info->last_trans_log_full_commit >
- fs_info->last_trans_committed) {
- ret = 1;
- goto end_no_trans;
- }
-
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
- goto end_no_trans;
- }
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
- last_committed);
- if (ret)
- goto end_no_trans;
-
- /*
- * Skip already logged inodes or inodes corresponding to tmpfiles
- * (since logging them is pointless, a link count of 0 means they
- * will never be accessible).
- */
- if (btrfs_inode_in_log(inode, trans->transid) ||
- inode->vfs_inode.i_nlink == 0) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ /* Skip inodes that are already logged and have no new extents. */
+ if (btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents))
+ return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_no_trans;
+ return ret;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6014,14 +7492,17 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed) {
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
- log_dentries = true;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
/*
* On unlink we must make sure all our current and old parent directory
@@ -6064,7 +7545,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
- if (inode->last_unlink_trans > last_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
@@ -6075,19 +7556,17 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, inode, ctx);
- else
- ret = 0;
+ ret = log_new_dir_dentries(trans, inode, ctx);
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -6099,15 +7578,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6123,8 +7600,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
@@ -6144,26 +7619,28 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto error;
}
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6177,18 +7654,22 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
+ btrfs_abort_transaction(trans, ret);
goto error;
}
- wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
- true);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
/*
* We didn't find the subvol, likely because it was
@@ -6201,33 +7682,37 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans,
- log->node->start,
- log->node->len);
- btrfs_put_root(log);
-
- if (!ret)
- goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
- goto error;
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
}
- wc.replay_dest->log_root = log;
- btrfs_record_root_in_trans(trans, wc.replay_dest);
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
}
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- struct btrfs_root *root = wc.replay_dest;
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
- btrfs_release_path(path);
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.root;
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
@@ -6236,17 +7721,22 @@ again:
* root->objectid_mutex is not acquired as log replay
* could only happen during mount.
*/
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
}
-
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
- btrfs_put_root(log);
+next:
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
+ }
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
-next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
@@ -6255,7 +7745,7 @@ next:
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -6273,14 +7763,14 @@ next:
if (ret)
return ret;
- log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -6298,7 +7788,7 @@ error:
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
- int for_rename)
+ bool for_rename)
{
/*
* when we're logging a file, if it hasn't been renamed
@@ -6314,18 +7804,25 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
inode->last_unlink_trans = trans->transid;
mutex_unlock(&inode->log_mutex);
+ if (!for_rename)
+ return;
+
/*
- * if this directory was already logged any new
- * names for this file/dir will get recorded
+ * If this directory was already logged, any new names will be logged
+ * with btrfs_log_new_name() and old names will be deleted from the log
+ * tree with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- if (dir->logged_trans == trans->transid)
+ if (inode_logged(trans, dir, NULL) == 1)
return;
/*
- * if the inode we're about to unlink was logged,
- * the log will be properly updated for any new names
+ * If the inode we're about to unlink was logged before, the log will be
+ * properly updated with the new name with btrfs_log_new_name() and the
+ * old name removed with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- if (inode->logged_trans == trans->transid)
+ if (inode_logged(trans, inode, NULL) == 1)
return;
/*
@@ -6335,13 +7832,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* properly. So, we have to be conservative and force commits
* so the new name gets discovered.
*/
- if (for_rename)
- goto record;
-
- /* we can safely do the unlink without any special recording */
- return;
-
-record:
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
@@ -6368,29 +7858,58 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
}
/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+ * Call this when creating a subvolume in a directory.
+ * Because we don't commit a transaction when creating a subvolume, we can't
+ * allow the directory pointing to the subvolume to be logged with an entry that
+ * points to an unpersisted root if we are still in the transaction used to
+ * create the subvolume, so make any attempt to log the directory result in a
+ * full log sync.
+ * Also we don't need to worry about renames, since btrfs_rename() marks the log
+ * for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
+ */
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/*
+ * Update the log after adding a new name for an inode.
*
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
*
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
int ret;
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
+
/*
* this will force the logging code to walk the dentry chain
* up for the file
@@ -6402,36 +7921,130 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
+
+ /*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
+ */
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+ struct fscrypt_name fname;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
+ "old_dir_index=%llu", old_dir_index);
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
+ ret = fscrypt_setup_filename(&old_dir->vfs_inode,
+ &old_dentry->d_name, 0, &fname);
if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does, it's not serious, just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
+ log_pinned = true;
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ /*
+ * Another concurrent task might be logging the old directory,
+ * as it can be triggered when logging another inode that had or
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in-progress directory logging operation.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ &fname.disk_name, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}