summaryrefslogtreecommitdiff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c8141
1 files changed, 5993 insertions, 2148 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2c6791493637..fff37c8d96a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1,34 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
+#include <linux/iversion.h>
+#include "misc.h"
#include "ctree.h"
-#include "transaction.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
-#include "print-tree.h"
#include "backref.h"
-#include "compat.h"
-#include "tree-log.h"
-#include "hash.h"
+#include "compression.h"
+#include "qgroup.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "file.h"
+#include "orphan.h"
+#include "print-tree.h"
+#include "tree-checker.h"
+#include "delayed-inode.h"
+
+#define MAX_CONFLICT_INODES 10
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -36,8 +39,10 @@
* LOG_INODE_EXISTS means to log just enough to recreate the inode
* during log replay
*/
-#define LOG_INODE_ALL 0
-#define LOG_INODE_EXISTS 1
+enum {
+ LOG_INODE_ALL,
+ LOG_INODE_EXISTS,
+};
/*
* directory trouble cases
@@ -91,21 +96,143 @@
* The last stage is to deal with directories and links and extents
* and all the other fun semantics
*/
-#define LOG_WALK_PIN_ONLY 0
-#define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_ALL 2
+enum {
+ LOG_WALK_PIN_ONLY,
+ LOG_WALK_REPLAY_INODES,
+ LOG_WALK_REPLAY_DIR_INDEX,
+ LOG_WALK_REPLAY_ALL,
+};
+
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key being processed of the current log leaf. */
+ struct btrfs_key log_key;
+ /* The slot being processed of the current log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Further more, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all);
+ struct btrfs_inode *inode,
+ int inode_only,
+ struct btrfs_log_ctx *ctx);
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -130,49 +257,114 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* and once to do all the other items.
*/
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct btrfs_inode *inode;
+
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
- int ret;
- int err = 0;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
+ int ret = 0;
+ bool created = false;
+
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret) {
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
mutex_lock(&root->log_mutex);
+
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
+ if (btrfs_need_log_full_commit(trans)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
- root->log_multiple_pids = false;
} else if (root->log_start_pid != current->pid) {
- root->log_multiple_pids = true;
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ }
+ } else {
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Do the full commit not to mix
+ * nodes from multiple log transactions to do sequential
+ * writing.
+ */
+ if (zoned && !created) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
}
- atomic_inc(&root->log_batch);
- atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
- return 0;
- }
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
- mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
- ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
+
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
- atomic_inc(&root->log_batch);
+
atomic_inc(&root->log_writers);
+ if (!ctx->logging_new_name) {
+ int index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -182,15 +374,22 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
- smp_mb();
- if (!root->log_root)
- return -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
@@ -202,14 +401,9 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -219,99 +413,67 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
- if (waitqueue_active(&root->log_writer_wait))
- wake_up(&root->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
}
}
-
-/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen);
-};
-
/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+static int process_one_buffer(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
/*
* If this fs is mixed then we need to be able to process the leaves to
* pin down any logged extents, so we have to read the block.
*/
- if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen);
- if (ret)
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen
+ };
+
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
- if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
- eb->start, eb->len);
+ if (wc->pin) {
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
- ret = btrfs_exclude_logged_extents(log, eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
+ ret = btrfs_exclude_logged_extents(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The given root is for the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and will be
+ * released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -319,61 +481,67 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
- int overwrite_root = 0;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
- overwrite_root = 1;
+ /*
+ * This is only used during log replay, so the root is always from a
+ * fs/subvolume tree. In case we ever need to support a log root, then
+ * we'll have to clone the leaf in the path, release the path and use
+ * the leaf before writing into the log tree. See the comments at
+ * copy_items() for more details.
+ */
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
- item_size = btrfs_item_size_nr(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
return ret;
+ }
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
- btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
+ if (!src_copy) {
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
-
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -382,7 +550,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -390,48 +558,72 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
+ u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
- item = btrfs_item_ptr(eb, slot,
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
+
+ /*
+ * If this is a directory we need to reset the i_size to
+ * 0 so that we can set it up properly when replaying
+ * the rest of the items in this log.
+ */
+ mode = btrfs_inode_mode(wc->log_leaf, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
+ u32 mode;
/*
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
+
+ /*
+ * If this is a directory we need to reset the i_size to 0 so
+ * that we can set it up properly when replaying the rest of
+ * the items in this log.
+ */
+ mode = btrfs_inode_mode(wc->log_leaf, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
+ wc->subvol_path->skip_release_on_error = true;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = false;
+
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
- if (ret == -EEXIST) {
- u32 found_size;
- found_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
- btrfs_truncate_item(root, path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(root, path,
- item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -442,70 +634,72 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0)
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
+
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0)
+ btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
+ }
- if (overwrite_root &&
- S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
+static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
+ struct fscrypt_str *name)
{
- struct btrfs_key key;
- struct inode *inode;
+ char *buf;
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
- if (IS_ERR(inode)) {
- inode = NULL;
- } else if (is_bad_inode(inode)) {
- iput(inode);
- inode = NULL;
- }
- return inode;
+ buf = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, buf, (unsigned long)start, len);
+ name->name = buf;
+ name->len = len;
+ return 0;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the
@@ -520,48 +714,53 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
- unsigned long size;
+ struct btrfs_inode *inode = NULL;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size, root->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
}
/*
@@ -569,136 +768,299 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
- start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
- btrfs_release_path(path);
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
- if (ret)
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
-
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- offset = key->offset - btrfs_file_extent_offset(eb, item);
-
- if (ins.objectid > 0) {
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_extent(root, ins.objectid,
- ins.offset);
- if (ret == 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- ins.objectid, ins.offset,
- 0, root->root_key.objectid,
- key->objectid, offset, 0);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- root, root->root_key.objectid,
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ goto update_inode;
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
+
+ /* A hole and NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
- ret = btrfs_lookup_csums_range(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
+
+ /*
+ * Manually record dirty extent, as here we did a shallow file extent
+ * item copy and skip normal backref update, but modifying extent tree
+ * all by ourselves. So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree (doesn't affect
+ * qgroup) to fs/file tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+ }
+
+ btrfs_release_path(wc->subvol_path);
+
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ }
+
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+ * Now delete all existing cums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying to lookup for
+ * the checksum of any block of our extent starting at an offset of 40K
+ * or higher, will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
if (ret)
- goto out;
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- root->fs_info->csum_root,
- sums);
- list_del(&sums->list);
- kfree(sums);
- }
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
- goto out;
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+update_inode:
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
}
- inode_add_bytes(inode, nbytes);
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
- if (inode)
- iput(inode);
+ iput(&inode->vfs_inode);
+ return ret;
+}
+
+static int unlink_inode_for_log_replay(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_trans_handle *trans = wc->trans;
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+ return ret;
+ }
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
return ret;
}
@@ -710,85 +1072,93 @@ out:
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct inode *dir,
+static noinline int drop_one_dir_item(struct walk_control *wc,
+ struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
- struct inode *inode;
- char *name;
- int name_len;
- struct extent_buffer *leaf;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_inode *inode;
+ struct fscrypt_str name;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
- name_len = btrfs_dir_name_len(leaf, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name)
- return -ENOMEM;
+ ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
- if (ret)
- goto out;
- btrfs_run_delayed_items(trans, root);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
- kfree(name);
- iput(inode);
+ kfree(name.name);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, u64 objectid, u64 index,
- const char *name, int name_len)
+ struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
- index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ index, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
- di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -804,251 +1174,266 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- char *name, int namelen)
+ const struct fscrypt_str *name)
{
- struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
- unsigned long ptr;
- unsigned long ptr_end;
- unsigned long name_ptr;
- int found_name_len;
- int item_size;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
- int match = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret != 0)
- goto out;
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid, name);
+ else
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0], name);
+ return ret;
+}
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+static int unlink_refs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
+{
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
- if (btrfs_find_name_in_ext_backref(path, ref_objectid,
- name, namelen, NULL))
- match = 1;
+ /*
+ * Check all the names in this back reference to see if they are in the
+ * log. If so, we allow them to stay otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
+ struct btrfs_inode_ref *victim_ref;
+ int ret;
- goto out;
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
+
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
+ return ret;
+ }
+ kfree(victim_name.name);
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
+ }
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- ref = (struct btrfs_inode_ref *)ptr;
- found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
- if (found_name_len == namelen) {
- name_ptr = (unsigned long)(ref + 1);
- ret = memcmp_extent_buffer(path->nodes[0], name,
- name_ptr, namelen);
- if (ret == 0) {
- match = 1;
- goto out;
+ return 0;
+}
+
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
+{
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ u32 cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
+ struct btrfs_inode_extref *extref;
+ struct fscrypt_str victim_name;
+ int ret;
+
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
+ goto next;
+
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
+
+ search_key->objectid = btrfs_ino(inode);
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
+ return ret;
}
+ kfree(victim_name.name);
+next:
+ cur_offset += victim_name.len + sizeof(*extref);
+ continue;
}
- ptr = (unsigned long)(ref + 1) + found_name_len;
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
-out:
- btrfs_free_path(path);
- return match;
+
+ return 0;
}
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
- struct inode *dir, struct inode *inode,
- struct extent_buffer *eb,
- u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+static inline int __add_inode_ref(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 ref_index, struct fscrypt_str *name)
{
int ret;
- char *victim_name;
- int victim_name_len;
- struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
again:
/* Search old style refs */
- search_key.objectid = inode_objectid;
+ search_key.objectid = btrfs_ino(inode);
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
- struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&search_key),
+ btrfs_root_id(root));
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
*/
if (search_key.objectid == search_key.offset)
return 1;
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- victim_ref = (struct btrfs_inode_ref *)ptr;
- victim_name_len = btrfs_inode_ref_name_len(leaf,
- victim_ref);
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
-
- read_extent_buffer(leaf, victim_name,
- (unsigned long)(victim_ref + 1),
- victim_name_len);
-
- if (!backref_in_log(log_root, &search_key,
- parent_objectid,
- victim_name,
- victim_name_len)) {
- btrfs_inc_nlink(inode);
- btrfs_release_path(path);
-
- ret = btrfs_unlink_inode(trans, root, dir,
- inode, victim_name,
- victim_name_len);
- kfree(victim_name);
- if (ret)
- return ret;
- btrfs_run_delayed_items(trans, root);
- *search_done = 1;
- goto again;
- }
- kfree(victim_name);
-
- ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
- }
-
- /*
- * NOTE: we have searched root tree and checked the
- * coresponding ref, it does not need to check again.
- */
- *search_done = 1;
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
- inode_objectid, parent_objectid, 0,
- 0);
- if (!IS_ERR_OR_NULL(extref)) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct inode *victim_parent;
-
- leaf = path->nodes[0];
-
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
-
- while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)base + cur_offset;
-
- victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
-
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
-
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
- read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
- victim_name_len);
-
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name,
- victim_name_len);
- ret = 0;
- if (!backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len)) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- btrfs_inc_nlink(inode);
- btrfs_release_path(path);
-
- ret = btrfs_unlink_inode(trans, root,
- victim_parent,
- inode,
- victim_name,
- victim_name_len);
- btrfs_run_delayed_items(trans, root);
- }
- iput(victim_parent);
- kfree(victim_name);
- if (ret)
- return ret;
- *search_done = 1;
- goto again;
- }
- kfree(victim_name);
- if (ret)
- return ret;
-next:
- cur_offset += victim_name_len + sizeof(*extref);
- }
- *search_done = 1;
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
- ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
+ ref_index, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
+ } else if (di) {
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- /* look for a conflicing name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
- name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ /* look for a conflicting name */
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
+ } else if (di) {
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
+ struct fscrypt_str *name, u64 *index,
u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
+ int ret;
extref = (struct btrfs_inode_extref *)ref_ptr;
- *namelen = btrfs_inode_extref_name_len(eb, extref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)&extref->name,
- *namelen);
+ ret = read_alloc_one_name(eb, &extref->name,
+ btrfs_inode_extref_name_len(eb, extref), name);
+ if (ret)
+ return ret;
- *index = btrfs_inode_extref_index(eb, extref);
+ if (index)
+ *index = btrfs_inode_extref_index(eb, extref);
if (parent_objectid)
*parent_objectid = btrfs_inode_extref_parent(eb, extref);
@@ -1056,66 +1441,158 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
}
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+ struct fscrypt_str *name, u64 *index)
{
struct btrfs_inode_ref *ref;
+ int ret;
ref = (struct btrfs_inode_ref *)ref_ptr;
- *namelen = btrfs_inode_ref_name_len(eb, ref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+ ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
+ name);
+ if (ret)
+ return ret;
- *index = btrfs_inode_ref_index(eb, ref);
+ if (index)
+ *index = btrfs_inode_ref_index(eb, ref);
return 0;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Take an inode reference item from the log tree and iterate all names from the
+ * inode reference item in the subvolume tree with the same key (if it exists).
+ * For any name that is not in the inode reference item from the log tree, do a
+ * proper unlink of that name (that is, remove its entry from the inode
+ * reference item and both dir index keys).
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
- struct inode *dir;
- struct inode *inode;
+ struct btrfs_root *root = wc->root;
+ int ret;
unsigned long ref_ptr;
unsigned long ref_end;
- char *name;
- int namelen;
+ struct extent_buffer *eb;
+
+again:
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
+ goto out;
+ }
+
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
+ while (ref_ptr < ref_end) {
+ struct fscrypt_str name;
+ u64 parent_id;
+
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = extref_get_fields(eb, ref_ptr, &name,
+ NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
+ } else {
+ parent_id = wc->log_key.offset;
+ ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
+ }
+
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
+ parent_id, &name);
+ else
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
+
+ if (!ret) {
+ struct btrfs_inode *dir;
+
+ btrfs_release_path(wc->subvol_path);
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
+ goto out;
+ }
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
+ kfree(name.name);
+ iput(&dir->vfs_inode);
+ if (ret)
+ goto out;
+ goto again;
+ }
+
+ kfree(name.name);
+ ref_ptr += name.len;
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ref_ptr += sizeof(struct btrfs_inode_extref);
+ else
+ ref_ptr += sizeof(struct btrfs_inode_ref);
+ }
+ ret = 0;
+ out:
+ btrfs_release_path(wc->subvol_path);
+ return ret;
+}
+
+/*
+ * Replay one inode back reference item found in the log tree.
+ * Path is for temporary use by this function (it should be released on return).
+ */
+static noinline int add_inode_ref(struct walk_control *wc)
+{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ struct fscrypt_str name = { 0 };
int ret;
- int search_done = 0;
- int log_ref_ver = 0;
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1123,38 +1600,95 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir)
- return -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
+ dir = NULL;
+ goto out;
+ }
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- iput(dir);
- return -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
+ inode = NULL;
+ goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ if (is_extref_item) {
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
- if (!dir)
- return -ENOENT;
+ if (!dir) {
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may have not been
+ * logged and not exist in the subvolume
+ * tree, see the comment above before
+ * the loop when getting the first
+ * parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
+ }
+ goto out;
+ }
+ }
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- return ret;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
- ref_index, name, namelen)) {
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1162,60 +1696,71 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- dir, inode, eb,
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret == 1) {
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
+ if (ret) {
+ if (ret == 1)
ret = 0;
- goto out;
- }
- if (ret)
- goto out;
+ goto out;
}
/* insert our name */
- ret = btrfs_add_link(trans, dir, inode, name, namelen,
- 0, ref_index);
- if (ret)
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
- ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
- kfree(name);
- if (log_ref_ver) {
- iput(dir);
+next:
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+ kfree(name.name);
+ name.name = NULL;
+ if (is_extref_item && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
+ /*
+ * Before we overwrite the inode reference item in the subvolume tree
+ * with the item from the log tree, we must unlink all names from the
+ * parent directory that are in the subvolume's tree inode reference
+ * item, otherwise we end up with an inconsistent subvolume tree where
+ * dir index entries exist for a name but there is no inode reference
+ * item with the same name.
+ */
+ ret = unlink_old_inode_refs(wc, inode);
+ if (ret)
+ goto out;
+
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
- iput(dir);
- iput(inode);
- return ret;
-}
-
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset)
-{
- int ret;
- ret = btrfs_find_orphan_item(root, offset);
- if (ret > 0)
- ret = btrfs_insert_orphan_item(trans, root, offset);
+ btrfs_release_path(wc->subvol_path);
+ kfree(name.name);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1229,14 +1774,15 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1252,13 +1798,12 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1273,7 +1818,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1281,13 +1826,14 @@ static int count_inode_refs(struct btrfs_root *root,
break;
path->slots[0]--;
}
+process_slot:
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid != ino ||
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
@@ -1301,6 +1847,10 @@ static int count_inode_refs(struct btrfs_root *root,
if (key.offset == 0)
break;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ goto process_slot;
+ }
key.offset--;
btrfs_release_path(path);
}
@@ -1319,29 +1869,22 @@ static int count_inode_refs(struct btrfs_root *root,
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+static noinline int fixup_inode_link_count(struct walk_control *wc,
+ struct btrfs_inode *inode)
{
- struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = inode->root;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(inode);
+ const u64 ino = btrfs_ino(inode);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = count_inode_refs(root, inode, path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, inode, path);
- if (ret == -ENOENT)
- ret = 0;
-
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1349,78 +1892,84 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ goto out;
}
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
- ret = insert_orphan_item(trans, root, ino);
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_inode *inode;
+
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
- if (path->slots[0] == 0)
+ ret = 0;
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
- goto out;
+ break;
- btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ btrfs_release_path(wc->subvol_path);
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
+ }
- ret = fixup_inode_link_count(trans, root, inode);
- iput(inode);
+ ret = fixup_inode_link_count(wc, inode);
+ iput(&inode->vfs_inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -1430,38 +1979,50 @@ out:
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- btrfs_inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
} else {
- BUG(); /* Logic Error */
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1473,33 +2034,60 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
*/
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
u64 dirid, u64 index,
- char *name, int name_len, u8 type,
+ const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
+static int delete_conflicting_dir_entry(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_flags,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(wc, dir, dst_di);
+}
+
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@@ -1512,138 +2100,230 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
- char *name;
- int name_len;
- struct btrfs_dir_item *dst_di;
- struct btrfs_key found_key;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct fscrypt_str name = { 0 };
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
struct btrfs_key log_key;
- struct inode *dir;
- u8 log_type;
- int exists;
- int ret = 0;
-
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
-
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name)
- return -ENOMEM;
+ struct btrfs_key search_key;
+ struct btrfs_inode *dir;
+ u8 log_flags;
+ bool exists;
+ int ret;
+ bool update_size = true;
+ bool name_added = false;
+
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- log_type = btrfs_dir_type(eb, di);
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ goto out;
+ }
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
- btrfs_release_path(path);
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+ exists = (ret == 0);
+ ret = 0;
- if (key->type == BTRFS_DIR_ITEM_KEY) {
- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid,
- key->offset, name,
- name_len, 1);
- } else {
- /* Corruption */
- ret = -EINVAL;
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ }
+ dir_dst_matches = (ret == 1);
}
- if (IS_ERR_OR_NULL(dst_di)) {
- /* we need a sequence number to insert, so we only
- * do inserts for the BTRFS_DIR_INDEX_KEY types
- */
- if (key->type != BTRFS_DIR_INDEX_KEY)
+
+ btrfs_release_path(wc->subvol_path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
+ &log_key, log_flags, exists);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
- goto insert;
+ }
+ index_dst_matches = (ret == 1);
}
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
- /* the existing item matches the logged item */
- if (found_key.objectid == log_key.objectid &&
- found_key.type == log_key.type &&
- found_key.offset == log_key.offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ btrfs_release_path(wc->subvol_path);
+
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
goto out;
}
/*
- * don't drop the conflicting directory entry if the inode
- * for the new entry doesn't exist
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
*/
- if (!exists)
- goto out;
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = wc->log_key.objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, &name);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
- ret = drop_one_dir_item(trans, root, path, dir, dst_di);
- if (ret)
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
+ &name, &log_key);
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
+ if (!ret)
+ name_added = true;
+ update_size = false;
+ ret = 0;
- if (key->type == BTRFS_DIR_INDEX_KEY)
- goto insert;
out:
- btrfs_release_path(path);
- kfree(name);
- iput(dir);
+ if (!ret && update_size) {
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ }
+ kfree(name.name);
+ iput(&dir->vfs_inode);
+ if (!ret && name_added)
+ ret = 1;
return ret;
-
-insert:
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, path, key->objectid, key->offset,
- name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
- goto out;
- ret = 0;
- goto out;
}
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
int ret;
- u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(root, eb, di))
- return -EIO;
- name_len = btrfs_dir_name_len(eb, di);
- ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret)
- return ret;
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
+ "wc->log_key.type=%u", wc->log_key.type);
+
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+ * directory pointing to an inode with a wrong link that does not account
+ * for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+ * to ever delete the parent directory has it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
- return 0;
+
+ return ret;
}
/*
@@ -1659,7 +2339,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
- u64 dirid, int key_type,
+ u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
@@ -1672,7 +2352,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
return 1;
key.objectid = dirid;
- key.type = key_type;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1686,7 +2366,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
@@ -1704,17 +2384,16 @@ static noinline int find_dir_range(struct btrfs_root *root,
next:
/* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret)
goto out;
- } else {
- path->slots[0]++;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
@@ -1734,111 +2413,221 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
- struct inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_inode *dir,
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
- u32 item_size;
struct btrfs_dir_item *di;
- struct btrfs_dir_item *log_di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- char *name;
- struct inode *inode;
+ struct fscrypt_str name = { 0 };
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
-again:
- eb = path->nodes[0];
- slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(root, eb, di)) {
- ret = -EIO;
+ /*
+ * Currently we only log dir index keys. Even if we replay a log created
+ * by an older kernel that logged both dir index and dir item keys, all
+ * we need to do is process the dir index keys, we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
+
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
+ goto out;
+ }
+
+ if (!force_remove) {
+ struct btrfs_dir_item *log_di;
+
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
+ dir_key->objectid,
+ dir_key->offset, &name, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
+ ret = 0;
goto out;
}
+ }
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(wc->subvol_path);
+ btrfs_release_path(log_path);
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ ret = link_to_fixup_dir(wc, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
+out:
+ btrfs_release_path(wc->subvol_path);
+ btrfs_release_path(log_path);
+ kfree(name.name);
+ if (inode)
+ iput(&inode->vfs_inode);
+ return ret;
+}
+
+static int replay_xattr_deletes(struct walk_control *wc)
+{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
+ struct btrfs_key search_key;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
+ return -ENOMEM;
+ }
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
+ goto out;
+ }
+process_leaf:
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
goto out;
}
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
- log_di = NULL;
- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
- log_di = btrfs_lookup_dir_item(trans, log, log_path,
- dir_key->objectid,
- name, name_len, 0);
- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
- log_di = btrfs_lookup_dir_index_item(trans, log,
- log_path,
- dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
- }
- if (IS_ERR_OR_NULL(log_di)) {
- btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
+
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
+ goto out;
+ }
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
+ goto out;
+ }
+ btrfs_release_path(wc->subvol_path);
kfree(name);
- return -EIO;
+ search_key = key;
+ goto again;
}
-
- ret = link_to_fixup_dir(trans, root,
- path, location.objectid);
- if (ret) {
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
kfree(name);
- iput(inode);
goto out;
}
-
- btrfs_inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
- name, name_len);
- if (!ret)
- btrfs_run_delayed_items(trans, root);
kfree(name);
- iput(inode);
- if (ret)
- goto out;
-
- /* there might still be more names under this key
- * check and repeat if required
- */
- ret = btrfs_search_slot(NULL, root, dir_key, path,
- 0, 0);
- if (ret == 0)
- goto again;
- ret = 0;
- goto out;
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
}
- btrfs_release_path(log_path);
- kfree(name);
-
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
}
- ret = 0;
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_release_path(path);
- btrfs_release_path(log_path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
+
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
@@ -1849,99 +2638,113 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all)
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
- struct btrfs_path *log_path;
- struct inode *dir;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
- dir_key.type = BTRFS_DIR_ITEM_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
- btrfs_free_path(log_path);
- return 0;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
+ return ret;
}
-again:
+
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid, key_type,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
+ goto out;
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key " BTRFS_KEY_FMT,
+ btrfs_root_id(root),
+ BTRFS_KEY_FMT_VALUE(&dir_key));
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
+ goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
- found_key.type != dir_key.type)
- goto next_type;
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
-
-next_type:
ret = 0;
- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
- key_type = BTRFS_DIR_LOG_INDEX_KEY;
- dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(path);
- goto again;
- }
out:
- btrfs_release_path(path);
- btrfs_free_path(log_path);
- iput(dir);
+ btrfs_release_path(wc->subvol_path);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1956,127 +2759,249 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+static int replay_one_buffer(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level)
{
int nritems;
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int level;
- int i;
+ struct btrfs_tree_parent_check check = {
+ .transid = gen,
+ .level = level
+ };
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_buffer(eb, gen);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*
+ * Set to NULL since it was not yet read and in case we abort log replay
+ * on error, we have no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
+
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
+ /*
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
u32 mode;
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ ret = replay_xattr_deletes(wc);
+ if (ret)
+ break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
- /* for regular files, make sure corresponding
- * orhpan item exist. extents past the new EOF
- * will be truncated later by orphan cleanup.
+ /*
+ * Before replaying extents, truncate the inode to its
+ * size. We need to do it now and not after log replay
+ * because before an fsync we can have prealloc extents
+ * added beyond the inode's i_size. If we did it after,
+ * through orphan cleanup for example, we would drop
+ * those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
- ret = insert_orphan_item(wc->trans, root,
- key.objectid);
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_inode *inode;
+ u64 from;
+
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ break;
+ }
+ from = ALIGN(i_size_read(&inode->vfs_inode),
+ root->fs_info->sectorsize);
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
+ inode_sub_bytes(&inode->vfs_inode,
+ drop_args.bytes_found);
+ /* Update the inode's nbytes. */
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ }
+ iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
+
+ if (wc->ignore_cur_inode)
+ continue;
+
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
+ wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+ ret = replay_one_dir_item(wc);
+ if (ret)
+ break;
+ }
+
if (wc->stage < LOG_WALK_REPLAY_ALL)
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
- break;
- ret = 0;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
if (ret)
break;
- } else if (key.type == BTRFS_DIR_ITEM_KEY ||
- key.type == BTRFS_DIR_INDEX_KEY) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static int clean_log_buffer(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
{
- u64 root_owner;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
+
+ btrfs_tree_lock(eb);
+ btrfs_clear_buffer_dirty(trans, eb);
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_unlock(eb);
+
+ if (trans) {
+ int ret;
+
+ ret = btrfs_pin_reserved_extent(trans, eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
+ }
+
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+
+ return 0;
+}
+
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- struct extent_buffer *parent;
- u32 blocksize;
int ret = 0;
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
while (*level > 0) {
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ struct btrfs_tree_parent_check check = { 0 };
+
cur = path->nodes[*level];
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
+ WARN_ON(btrfs_header_level(cur) != *level);
if (path->slots[*level] >=
btrfs_header_nritems(cur))
@@ -2084,17 +3009,25 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
-
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
-
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
- if (!next)
- return -ENOMEM;
+ check.transid = ptr_gen;
+ check.level = *level - 1;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
+
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2102,22 +3035,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(root_owner !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(root,
- bytenr, blocksize);
+ ret = clean_log_buffer(trans, next);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2126,13 +3054,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
- WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
@@ -2140,21 +3071,15 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level] = 0;
cond_resched();
}
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- u64 root_owner;
int i;
int slot;
int ret;
@@ -2167,33 +3092,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- struct extent_buffer *parent;
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- root_owner = btrfs_header_owner(parent);
- ret = wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
+ ret = wc->process_func(path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]),
+ *level);
if (ret)
return ret;
if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[*level];
-
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(root,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
if (ret)
return ret;
}
@@ -2210,13 +3116,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2226,55 +3132,34 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- extent_buffer_get(log->node);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
- btrfs_header_generation(path->nodes[orig_level]));
+ ret = wc->process_func(path->nodes[orig_level], wc,
+ btrfs_header_generation(path->nodes[orig_level]),
+ orig_level);
if (ret)
- goto out;
- if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[orig_level];
-
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, log, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(log->root_key.objectid !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(log, next->start,
- next->len);
- if (ret)
- goto out;
- }
+ return ret;
+ if (wc->free)
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2283,23 +3168,24 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
+ struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
if (log->log_transid == 1) {
/* insert root item on the first sync */
- ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ ret = btrfs_insert_root(trans, fs_info->log_root_tree,
+ &log->root_key, root_item);
} else {
- ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ ret = btrfs_update_root(trans, fs_info->log_root_tree,
+ &log->root_key, root_item);
}
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2309,46 +3195,112 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
* so we know that if ours is more than 2 older than the
* current transaction, we're done
*/
- do {
+ for (;;) {
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
- mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
- atomic_read(&root->log_commit[index]))
- schedule();
+ if (!(root->log_transid_committed < transid &&
+ atomic_read(&root->log_commit[index])))
+ break;
- finish_wait(&root->log_commit_wait[index], &wait);
+ mutex_unlock(&root->log_mutex);
+ schedule();
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
- atomic_read(&root->log_commit[index]));
- return 0;
+ }
+ finish_wait(&root->log_commit_wait[index], &wait);
}
-static void wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
- prepare_to_wait(&root->log_writer_wait,
- &wait, TASK_UNINTERRUPTIBLE);
+
+ for (;;) {
+ prepare_to_wait(&root->log_writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&root->log_writers))
+ break;
+
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
- schedule();
+ schedule();
mutex_lock(&root->log_mutex);
- finish_wait(&root->log_writer_wait, &wait);
+ }
+ finish_wait(&root->log_writer_wait, &wait);
+}
+
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+ ctx->scratch_eb = NULL;
+}
+
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = ctx->inode;
+
+ if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return;
+
+ /*
+ * Don't care about allocation failure. This is just for optimization,
+ * if we fail to allocate here, we will try again later if needed.
+ */
+ ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
+}
+
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ btrfs_assert_inode_locked(ctx->inode);
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+ struct btrfs_log_ctx *safe;
+
+ list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
+ list_del_init(&ctx->list);
+ ctx->log_ret = error;
}
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -2357,75 +3309,111 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
int mark;
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
- struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
mutex_lock(&root->log_mutex);
- log_transid = root->log_transid;
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid,
+ "log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(root, log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
+ if (!btrfs_test_opt(fs_info, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
- wait_for_writer(trans, root);
+ wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
- ret = -EAGAIN;
- btrfs_free_logged_extents(log, log_transid);
+ if (btrfs_need_log_full_commit(trans)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
blk_start_plug(&plug);
- ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole writing out the extents,
+ * and we cannot proceed on a zoned filesystem, requiring
+ * sequential writing. While we can bail out to a full commit
+ * here, but we can continue hoping the concurrent writing fills
+ * the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
if (ret) {
blk_finish_plug(&plug);
- btrfs_abort_transaction(trans, root, ret);
- btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2433,124 +3421,202 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
- mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
- mutex_unlock(&log_root_tree->log_mutex);
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
- ret = update_log_root(trans, log);
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
- }
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
if (ret) {
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
- if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- mutex_unlock(&log_root_tree->log_mutex);
- goto out;
- }
- root->fs_info->last_trans_log_full_commit = trans->transid;
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(trans);
+ if (ret != -ENOSPC)
+ btrfs_err(fs_info,
+ "failed to update log for root %llu ret %d",
+ btrfs_root_id(root), ret);
+ btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ blk_finish_plug(&plug);
+ list_del_init(&root_log_ctx.list);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
- btrfs_free_logged_extents(log, log_transid);
+ ret = btrfs_wait_tree_log_extents(log, mark);
+ wait_log_commit(log_root_tree,
+ root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ if (!ret)
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
+ "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
+ root_log_ctx.log_transid, log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
- wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ wait_log_commit(log_root_tree,
+ root_log_ctx.log_transid - 1);
}
- wait_for_writer(trans, log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(trans)) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_free_logged_extents(log, log_transid);
+ btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
- ret = btrfs_write_marked_extents(log_root_tree,
+ ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for these write outs since the waiting cause a
+ * deadlock. Bail out to the full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ ret = btrfs_wait_tree_log_extents(log, mark);
+ if (!ret)
+ ret = btrfs_wait_tree_log_extents(log_root_tree,
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_NEW | EXTENT_DIRTY);
- btrfs_wait_logged_extents(log, log_transid);
-
- btrfs_set_super_log_root(root->fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
- * nobody else is going to jump in and write the the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
+ * Here we are guaranteed that nobody is going to write the superblock
+ * for the current transaction before us and that neither we do write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+ * 1) We are holding a handle on the current transaction, so no body
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+ * written its superblock, we wait for it to do it, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
*/
- btrfs_scrub_pause_super(root);
- ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
- btrfs_scrub_continue_super(root);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
+ */
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
goto out_wake_log_root;
}
- mutex_lock(&root->log_mutex);
- if (root->last_log_commit < log_transid)
- root->last_log_commit = log_transid;
- mutex_unlock(&root->log_mutex);
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
+ ret = write_all_supers(fs_info, 1);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ if (unlikely(ret)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ goto out_wake_log_root;
+ }
+
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
+ "last_log_commit(root)=%d log_transid=%d",
+ btrfs_get_root_last_log_commit(root), log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
+ mutex_lock(&log_root_tree->log_mutex);
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
- if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
- wake_up(&log_root_tree->log_commit_wait[index2]);
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ /*
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
+ */
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ mutex_lock(&root->log_mutex);
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
- if (waitqueue_active(&root->log_commit_wait[index1]))
- wake_up(&root->log_commit_wait[index1]);
+ mutex_unlock(&root->log_mutex);
+
+ /*
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
+ */
+ cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
@@ -2558,42 +3624,50 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
- if (trans) {
- ret = walk_log_tree(trans, log, &wc);
-
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, log, ret);
- }
-
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
- NULL);
- if (ret)
- break;
+ if (log->node) {
+ ret = walk_log_tree(&wc);
+ if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
+
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
- /*
- * We may have short-circuited the log tree with the full commit logic
- * and left ordered extents on our list, so clear these out to keep us
- * from leaking inodes and memory.
- */
- btrfs_free_logged_extents(log, 0);
- btrfs_free_logged_extents(log, 1);
+ btrfs_extent_io_tree_release(&log->dirty_log_pages);
+ btrfs_extent_io_tree_release(&log->log_csum_range);
- free_extent_buffer(log->node);
- kfree(log);
+ btrfs_put_root(log);
}
/*
@@ -2605,6 +3679,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -2615,10 +3690,192 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
if (fs_info->log_root_tree) {
free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up not logging new names and removing old
+ * names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
+/*
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
+ */
+static int inode_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
+{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
+ return 1;
+
+ /*
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+ * teared into two 32 bits updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
+ return 0;
+ }
+ spin_unlock(&inode->lock);
+
+ /*
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
+
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may have not been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less then the
+ * current transaction to avoid doing the search in future calls.
+ */
+ return mark_inode_as_not_logged(trans, inode);
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transaction's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ spin_unlock(&inode->lock);
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exists
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const struct fscrypt_str *name,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
+
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_del_item(trans, log, path);
+}
+
/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
@@ -2640,143 +3897,72 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ struct btrfs_root *root = dir->root;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
- int err = 0;
- int bytes_del = 0;
- u64 dir_ino = btrfs_ino(dir);
- if (BTRFS_I(dir)->logged_trans < trans->transid)
- return 0;
-
- ret = join_running_log_trans(root);
- if (ret)
- return 0;
-
- mutex_lock(&BTRFS_I(dir)->log_mutex);
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
- name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
- }
- btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
+ btrfs_set_log_full_commit(trans);
+ return;
}
- /* update the directory size in the log to reflect the names
- * we have removed
- */
- if (bytes_del) {
- struct btrfs_key key;
-
- key.objectid = dir_ino;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(path);
-
- ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 0) {
- struct btrfs_inode_item *item;
- u64 i_size;
+ ret = join_running_log_trans(root);
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ return;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- i_size = btrfs_inode_size(path->nodes[0], item);
- if (i_size > bytes_del)
- i_size -= bytes_del;
- else
- i_size = 0;
- btrfs_set_inode_size(path->nodes[0], item, i_size);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else
- ret = 0;
- btrfs_release_path(path);
- }
-fail:
- btrfs_free_path(path);
-out_unlock:
- mutex_unlock(&BTRFS_I(dir)->log_mutex);
- if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, root, ret);
+ mutex_lock(&dir->log_mutex);
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, index);
+ mutex_unlock(&dir->log_mutex);
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir)
{
- struct btrfs_root *log;
- u64 index;
+ struct btrfs_root *root = dir->root;
int ret;
- if (BTRFS_I(inode)->logged_trans < trans->transid)
- return 0;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
- if (ret)
- return 0;
- log = root->log_root;
- mutex_lock(&BTRFS_I(inode)->log_mutex);
-
- ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
- dirid, &index);
- mutex_unlock(&BTRFS_I(inode)->log_mutex);
- if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_log_trans(root);
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ return;
+ mutex_lock(&inode->log_mutex);
- return ret;
+ ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode),
+ btrfs_ino(dir), NULL);
+ mutex_unlock(&inode->log_mutex);
+ if (ret < 0 && ret != -ENOENT)
+ btrfs_set_log_full_commit(trans);
+ btrfs_end_log_trans(root);
}
/*
@@ -2787,7 +3973,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- int key_type, u64 dirid,
+ u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
@@ -2795,67 +3981,294 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = first_offset;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- key.type = BTRFS_DIR_LOG_ITEM_KEY;
- else
- key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ char AUTO_KFREE(ins_data);
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ u64 last_index;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0, "count=%d", count);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ return ret;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second items starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+
+ last_index = batch.keys[count - 1].offset;
+ ASSERT(last_index > inode->last_dir_index_offset,
+ "last_index=%llu inode->last_dir_index_offset=%llu",
+ last_index, inode->last_dir_index_offset);
+
+ /*
+ * If for some unexpected reason the last item's index is not greater
+ * than the last index we logged, warn and force a transaction commit.
+ */
+ if (WARN_ON(last_index <= inode->last_dir_index_offset))
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ else
+ inode->last_dir_index_offset = last_index;
+
+ if (btrfs_get_first_dir_index_to_log(inode) == 0)
+ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
+
+ return ret;
+}
+
+static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
+{
+ const int slot = path->slots[0];
+
+ if (ctx->scratch_eb) {
+ copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
+ } else {
+ ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!ctx->scratch_eb)
+ return -ENOMEM;
+ }
+
+ btrfs_release_path(path);
+ path->nodes[0] = ctx->scratch_eb;
+ path->slots[0] = slot;
+ /*
+ * Add extra ref to scratch eb so that it is not freed when callers
+ * release the path, so we can reuse it later if needed.
+ */
+ refcount_inc(&ctx->scratch_eb->refs);
+
+ return 0;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
+ const u64 ino = btrfs_ino(inode);
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int ret;
+
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ ret = clone_leaf(path, ctx);
+ if (ret < 0)
+ return ret;
+
+ src = path->nodes[0];
+
+ for (int i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, inode, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path, int key_type,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
- struct btrfs_key max_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
- int err = 0;
int ret;
- int i;
- int nritems;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
- max_key.objectid = ino;
- max_key.offset = (u64)-1;
- max_key.type = key_type;
-
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = min_offset;
- path->keep_locks = 1;
-
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -2863,7 +4276,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -2872,102 +4285,206 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
- if (key_type == tmp.type)
- first_offset = max(min_offset, tmp.offset) + 1;
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret > 0) {
+ ret = 0;
}
+
goto done;
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (key_type == tmp.type) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ goto done;
}
+
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again or the one we were at
+ * in the loop below in case we had to reschedule. We may be logging the
+ * directory without holding its VFS lock, which happen when logging new
+ * dentries (through log_new_dir_dentries()) or in some cases when we
+ * need to log the parent directory of an inode. This means a dir index
+ * key might be deleted from the inode's root, and therefore we may not
+ * find it anymore. If we can't find it, just move to the next key. We
+ * can not bail out and ignore, because if we do that we will simply
+ * not log dir index keys that come after the one that was just deleted
+ * and we can end up logging a dir index range that ends at (u64)-1
+ * (@last_offset is initialized to that), resulting in removing dir
+ * entries we should not remove at log replay time.
+ */
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (ret != 0) {
- WARN_ON(1);
- goto done;
+ if (ret > 0) {
+ ret = btrfs_next_item(root, path);
+ if (ret > 0) {
+ /* There are no more keys in the inode's root. */
+ ret = 0;
+ goto done;
+ }
}
+ if (ret < 0)
+ goto done;
/*
* we have a block from this transaction, log every item in it
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
- err = ret;
- goto done;
- }
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
* from this directory and from this transaction
*/
ret = btrfs_next_leaf(root, path);
- if (ret == 1) {
- last_offset = (u64)-1;
+ if (ret) {
+ if (ret == 1) {
+ last_offset = (u64)-1;
+ ret = 0;
+ }
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret)
- err = ret;
- else
- last_offset = tmp.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
btrfs_release_path(dst_path);
- if (err == 0) {
+ if (ret == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is == to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, key_type,
- ino, first_offset, last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset,
+ "last_old_dentry_offset=%llu last_offset=%llu",
+ last_old_dentry_offset, last_offset);
+ if (last_old_dentry_offset < last_offset)
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
}
- return err;
+
+ return ret;
+}
+
+/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is 0, so we don't know the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != 0)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
}
/*
@@ -2983,22 +4500,25 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
u64 min_key;
u64 max_key;
int ret;
- int key_type = BTRFS_DIR_ITEM_KEY;
-again:
- min_key = 0;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path,
- dst_path, key_type, min_key,
- &max_key);
+ ret = log_dir_items(trans, inode, path, dst_path,
+ ctx, min_key, &max_key);
if (ret)
return ret;
if (max_key == (u64)-1)
@@ -3006,10 +4526,6 @@ again:
min_key = max_key + 1;
}
- if (key_type == BTRFS_DIR_ITEM_KEY) {
- key_type = BTRFS_DIR_INDEX_KEY;
- goto again;
- }
return 0;
}
@@ -3019,40 +4535,42 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
- BUG_ON(ret == 0); /* Logic error */
- if (ret < 0)
- break;
-
- if (path->slots[0] == 0)
+ if (ret < 0) {
break;
+ } else if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
- path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
found_key.type = 0;
- ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
- &start_slot);
+ ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -3070,14 +4588,28 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
+
+ return btrfs_truncate_inode_items(trans, log_root, &control);
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only)
+ struct inode *inode, bool log_inode_only,
+ u64 logged_isize)
{
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token);
+ u64 flags;
if (log_inode_only) {
/* set the generation to zero so the recover code
@@ -3085,192 +4617,373 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, 0, &token);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(leaf, item,
- BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
- }
-
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
-
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_nsec, &token);
-
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
-
- btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
+ }
+
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ /*
+ * We do not need to set the nbytes field, in fact during a fast fsync
+ * its value may not even be correct, since a fast fsync does not wait
+ * for ordered extent completion, which is where we update nbytes, it
+ * only waits for writeback to complete. During log replay as we find
+ * file extent items and replay them, we adjust the nbytes field of the
+ * inode item in subvolume tree as needed (see overwrite_item()).
+ */
+
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
- struct inode *inode)
+ struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
struct btrfs_key key;
int ret;
- memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
- ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*inode_item));
- if (ret && ret != -EEXIST)
+ btrfs_get_inode_key(inode, &key);
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+ * We can never get -EEXIST because we are only called for a fast
+ * fsync and in case an inode eviction happens after the inode was
+ * logged before in the current transaction, when we load again
+ * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
+ * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
+ false, 0);
btrfs_release_path(path);
return 0;
}
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ const u64 lock_end = sums->logical + sums->len - 1;
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ /*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
+ * Serialize logging for checksums. This is to avoid racing with the
+ * same checksum being logged by another task that is logging another
+ * file which happens to refer to the same extent as well. Such races
+ * can leave checksum items in the log with overlapping ranges.
+ */
+ ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
+ if (ret)
+ return ret;
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
+
+ btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
+
+ return ret;
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
- struct inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *dst_path,
- struct extent_buffer *src,
- int start_slot, int nr, int inode_only)
+ struct btrfs_path *src_path,
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize, struct btrfs_log_ctx *ctx)
{
- unsigned long src_offset;
- unsigned long dst_offset;
- struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
+ struct extent_buffer *src;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- char *ins_data;
- int i;
- struct list_head ordered_sums;
- int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ struct btrfs_item_batch batch;
+ char AUTO_KFREE(ins_data);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
- INIT_LIST_HEAD(&ordered_sums);
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ ret = clone_leaf(src_path, ctx);
+ if (ret < 0)
+ return ret;
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ src = src_path->nodes[0];
+
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = 0;
+
+ dst_index = 0;
+ for (int i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
- for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
- }
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
- if (ret) {
- kfree(ins_data);
- return ret;
- }
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ if (skip_csum)
+ goto add_to_batch;
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
- struct btrfs_inode_item);
- fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS);
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
} else {
- copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, false);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ return ret;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
+ }
+
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ return 0;
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ return ret;
+
+ dst_index = 0;
+ for (int i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
+ if (dst_index >= batch.nr)
+ break;
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
+ btrfs_item_key_to_cpu(src, &key, src_slot);
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
- ret = btrfs_lookup_csums_range(
- log->fs_info->csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
- }
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
+ struct btrfs_inode_item);
+ fill_inode_item(trans, dst_path->nodes[0], inode_item,
+ &inode->vfs_inode,
+ inode_only == LOG_INODE_EXISTS,
+ logged_isize);
+ } else {
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[dst_index]);
}
+
+ dst_index++;
}
- btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
- kfree(ins_data);
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- ret = 0;
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- struct extent_map *em1, *em2;
+ const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
@@ -3282,130 +4995,47 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int log_one_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+static int log_extent_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
struct btrfs_ordered_extent *ordered;
- struct list_head ordered_sums;
- struct btrfs_map_token token;
- struct btrfs_key key;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
+ struct btrfs_root *csum_root;
+ u64 block_start;
u64 csum_offset;
u64 csum_len;
- u64 extent_offset = em->start - em->orig_start;
- u64 block_len;
- int ret;
- int index = log->log_transid % 2;
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0);
- if (ret)
- return ret;
-
- INIT_LIST_HEAD(&ordered_sums);
- btrfs_init_map_token(&token);
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = em->start;
-
- ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
- if (ret)
- return ret;
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
- &token);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- skip_csum = true;
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
- } else {
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
- if (em->block_start == 0)
- skip_csum = true;
- }
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
- }
-
- btrfs_set_token_file_extent_offset(leaf, fi,
- em->start - em->orig_start,
- &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_release_path(path);
- if (ret) {
- return ret;
- }
+ u64 mod_start = em->start;
+ u64 mod_len = em->len;
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
- if (skip_csum)
+ if (inode->flags & BTRFS_INODE_NODATASUM ||
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
+ em->disk_bytenr == EXTENT_MAP_HOLE)
return 0;
- if (em->compress_type) {
- csum_offset = 0;
- csum_len = block_len;
- }
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
- /*
- * First check and see if our csums are on our outstanding ordered
- * extents.
- */
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
- struct btrfs_ordered_sum *sum;
-
- if (!mod_len)
+ if (mod_len == 0)
break;
- if (ordered->inode != inode)
- continue;
-
- if (ordered->file_offset + ordered->len <= mod_start ||
- mod_start + mod_len <= ordered->file_offset)
+ if (ordered_end <= mod_start)
continue;
+ if (mod_end <= ordered->file_offset)
+ break;
/*
* We are going to copy all the csums on this ordered extent, so
- * go ahead and adjust mod_start and mod_len in case this
- * ordered extent has already been logged.
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
*/
if (ordered->file_offset > mod_start) {
- if (ordered->file_offset + ordered->len >=
- mod_start + mod_len)
+ if (ordered_end >= mod_end)
mod_len = ordered->file_offset - mod_start;
/*
* If we have this case
@@ -3418,12 +5048,9 @@ again:
* will be ok.
*/
} else {
- if (ordered->file_offset + ordered->len <
- mod_start + mod_len) {
- mod_len = (mod_start + mod_len) -
- (ordered->file_offset + ordered->len);
- mod_start = ordered->file_offset +
- ordered->len;
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
} else {
mod_len = 0;
}
@@ -3433,52 +5060,45 @@ again:
* To keep us from looping for the above case of an ordered
* extent that falls inside of the logged extent.
*/
- if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
- &ordered->flags))
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
- wait_event(ordered->wait, ordered->csum_bytes_left == 0);
-
- list_for_each_entry(sum, &ordered->list, list) {
- ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
- goto unlocked;
- }
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
}
- btrfs_put_ordered_extent(ordered);
- goto again;
-
}
- spin_unlock_irq(&log->log_extents_lock[index]);
-unlocked:
- if (!mod_len || ret)
- return ret;
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
- csum_offset = mod_start - em->start;
- csum_len = mod_len;
+ /* If we're compressed we have to save the entire range of csums. */
+ if (btrfs_extent_map_is_compressed(em)) {
+ csum_offset = 0;
+ csum_len = em->disk_num_bytes;
+ } else {
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+ }
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
- em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
- if (ret)
+ block_start = btrfs_extent_map_block_start(em);
+ csum_root = btrfs_csum_root(trans->fs_info, block_start);
+ ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
+ block_start + csum_offset + csum_len - 1,
+ &ordered_sums, false);
+ if (ret < 0)
return ret;
+ ret = 0;
while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -3486,26 +5106,243 @@ unlocked:
return ret;
}
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item fi = { 0 };
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ enum btrfs_compression_type compress_type;
+ u64 extent_offset = em->offset;
+ u64 block_start = btrfs_extent_map_block_start(em);
+ u64 block_len;
+ int ret;
+
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (em->flags & EXTENT_FLAG_PREALLOC)
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = em->disk_num_bytes;
+ compress_type = btrfs_extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
+
+ ret = log_extent_csums(trans, inode, log, em, ctx);
+ if (ret)
+ return ret;
+
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (ctx->logged_before) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
+
+ if (!drop_args.extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key;
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ BTRFS_PATH_AUTO_FREE(dst_path);
+ bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
+ int ins_nr = 0;
+ int start_slot = 0;
+ int ret;
+
+ if (!(inode->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = i_size;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
+ while (true) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0, ctx);
+ if (ret < 0)
+ goto out;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY ||
+ key.offset < i_size) {
+ path->slots[0]++;
+ continue;
+ }
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ min(key.offset, truncate_offset),
+ BTRFS_EXTENT_DATA_KEY);
+ if (ret)
+ goto out;
+ dropped_extents = true;
+ }
+ truncate_offset = btrfs_file_extent_end(path);
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ if (!dst_path) {
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+ if (ins_nr > 0)
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0, ctx);
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
- struct list_head extents;
- struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
- u64 test_gen;
+ LIST_HEAD(extents);
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
int num = 0;
- INIT_LIST_HEAD(&extents);
-
write_lock(&tree->lock);
- test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
-
/*
* Just an arbitrary number, this can be really CPU intensive
* once we start getting a lot of extents, and really once we
@@ -3518,20 +5355,25 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
goto process;
}
- if (em->generation <= test_gen)
+ if (em->generation < trans->transid)
+ continue;
+
+ /* We log prealloc extents beyond eof later. */
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
+ em->start >= i_size_read(&inode->vfs_inode))
continue;
+
/* Need a ref to keep it from getting evicted from cache */
- atomic_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ refcount_inc(&em->refs);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
list_sort(NULL, &extents, extent_cmp);
-
process:
while (!list_empty(&extents)) {
- em = list_entry(extents.next, struct extent_map, list);
+ em = list_first_entry(&extents, struct extent_map, list);
list_del_init(&em->list);
@@ -3540,22 +5382,1414 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(tree, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
continue;
}
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(tree, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
+ if (!ret)
+ ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
+ if (ret)
+ return ret;
+
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock(&inode->ordered_tree_lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock(&inode->ordered_tree_lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
+}
+
+static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = 0;
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller then the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when before a power failure we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
+ }
+
btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0, ctx);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ found_xattrs = true;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0, ctx);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
+ return 0;
+}
+
+/*
+ * When using the NO_HOLES feature if we punched a hole that causes the
+ * deletion of entire leafs or all the extent items of the first leaf (the one
+ * that contains the inode item and references) we may end up not processing
+ * any extents, because there are no leafs with a generation matching the
+ * current transaction that have extent items for our inode. So we need to find
+ * if any holes exist and then log them. We also need to log holes after any
+ * truncate operation that changes the inode's size.
+ */
+static int btrfs_log_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prev_extent_end = 0;
+ int ret;
+
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /* We have a hole, log it. */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_len = key.offset - prev_extent_end;
+
+ /*
+ * Release the path to avoid deadlocks with other code
+ * paths that search the root while holding locks on
+ * leafs from the log root.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Search for the same key again in the root. Since it's
+ * an extent item and we are holding the inode lock, the
+ * key must still exist. If it doesn't just emit warning
+ * and return an error to fall back to a transaction
+ * commit.
+ */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (WARN_ON(ret > 0))
+ return -ENOENT;
+ leaf = path->nodes[0];
+ }
+
+ prev_extent_end = btrfs_file_extent_end(path);
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ if (prev_extent_end < i_size) {
+ u64 hole_len;
+
+ btrfs_release_path(path);
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ u64 *other_ino, u64 *other_parent)
+{
+ BTRFS_PATH_AUTO_FREE(search_path);
+ char AUTO_KFREE(name);
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = true;
+ search_path->skip_locking = true;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+ struct fscrypt_str name_str;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name)
+ return -ENOMEM;
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+
+ name_str.name = name;
+ name_str.len = this_name_len;
+ di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
+ parent, &name_str, 0);
+ if (di && !IS_ERR(di)) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+ di, &di_key);
+ if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+ if (di_key.objectid != key->objectid) {
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ return 1;
+ } else {
+ return 0;
+ }
+ } else {
+ return -EAGAIN;
+ }
+ } else if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the later
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode_logged(trans, inode, NULL) == 1 &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ struct btrfs_inode *curr_inode = start_inode;
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Pairs with btrfs_add_delayed_iput below. */
+ ihold(&curr_inode->vfs_inode);
+
+ while (true) {
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 next_index;
+ bool continue_curr_inode = true;
+ int iter_ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+ next_index = key.offset;
+again:
+ btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct btrfs_inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ if (found_key.objectid != ino ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ next_index = found_key.offset + 1;
+
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ type = btrfs_dir_ftype(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ btrfs_release_path(path);
+
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
+ } else if (iter_ret > 0) {
+ continue_curr_inode = false;
+ } else {
+ key = found_key;
+ }
+
+ if (continue_curr_inode && key.offset < (u64)-1) {
+ key.offset++;
+ goto again;
+ }
+
+ btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+
+ btrfs_add_delayed_iput(curr_inode);
+
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
+ break;
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (curr_inode)
+ btrfs_add_delayed_iput(curr_inode);
+
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
+
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = true;
+ path->skip_locking = true;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = false;
+ path->skip_locking = false;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *ino_elem;
+ struct btrfs_inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget_logging(ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+}
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret = 0;
+
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct btrfs_inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
+
+ inode = btrfs_iget_logging(ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory. See the comment at add_conflicting_inode().
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget_logging(parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
+ }
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
+ continue;
+ }
+
+ /*
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
+ *
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
+ */
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
+ }
+
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
+ return ret;
+}
+
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const int inode_only,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
+
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
+ *need_log_inode_item = false;
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 &&
+ other_ino != btrfs_ino(ctx->inode)) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize, ctx);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
+ if (ret)
+ return ret;
+ goto next_key;
+ }
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize, ctx);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize, ctx);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize, ctx);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize, ctx);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
+ }
+
+ return ret;
+}
+
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start = 0;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * No need for the log mutex, plus to avoid potential deadlocks or
+ * lockdep annotations due to nesting of delayed inode mutexes and log
+ * mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries,
+ "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
return ret;
}
@@ -3574,22 +6808,26 @@ process:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+ struct btrfs_inode *inode,
+ int inode_only,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
- struct extent_buffer *src = NULL;
- int err = 0;
+ struct btrfs_root *log = inode->root->log_root;
int ret;
- int nritems;
- int ins_start_slot = 0;
- int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 logged_isize = 0;
+ bool need_log_inode_item = true;
+ bool xattrs_logged = false;
+ bool inode_item_dropped = true;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -3608,249 +6846,602 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
/* today the code can only do partial logging of directories */
- if (S_ISDIR(inode->i_mode) ||
+ if (S_ISDIR(inode->vfs_inode.i_mode) ||
(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags) &&
- inode_only == LOG_INODE_EXISTS))
+ &inode->runtime_flags) &&
+ inode_only >= LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /* Only run delayed items if we are a dir or a new file */
- if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
+ /*
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so lets prevent deep levels of
+ * recursion to happen by limiting the maximum depth to be 1.
+ */
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
- }
+ if (ret)
+ goto out;
}
- mutex_lock(&BTRFS_I(inode)->log_mutex);
+ mutex_lock(&inode->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
+ * This is for cases where logging a directory could result in losing a
+ * a file after replaying the log. For example, if we move a file from a
+ * directory A to a directory B, then fsync directory A, we have no way
+ * to known the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_unlock;
+ }
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
- if (S_ISDIR(inode->i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ BTRFS_XATTR_ITEM_KEY);
} else {
- if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &inode->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &inode->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags)) {
+ &inode->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
- ret = log_inode_item(trans, log, dst_path, inode);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
+ inode_item_dropped = false;
goto log_extents;
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
- path->keep_locks = 1;
-
- while (1) {
- ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, trans->transid);
- if (ret != 0)
- break;
-again:
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != ino)
- break;
- if (min_key.type > max_key.type)
- break;
- src = path->nodes[0];
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
- ins_nr++;
- goto next_slot;
- } else if (!ins_nr) {
- ins_start_slot = path->slots[0];
- ins_nr = 1;
- goto next_slot;
- }
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
- ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
- err = ret;
+ /*
+ * If we are fsyncing a file with 0 hard links, then commit the delayed
+ * inode because the last inode ref (or extref) item may still be in the
+ * subvolume tree and if we log it the file will still exist after a log
+ * replay. So commit the delayed inode to delete that last ref and we
+ * skip logging it.
+ */
+ if (inode->vfs_inode.i_nlink == 0) {
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret)
goto out_unlock;
- }
- ins_nr = 1;
- ins_start_slot = path->slots[0];
-next_slot:
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
- path->slots[0]);
- goto again;
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, src,
- ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
- btrfs_release_path(path);
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ inode_only, ctx,
+ &need_log_inode_item);
+ if (ret)
+ goto out_unlock;
- if (min_key.offset < (u64)-1)
- min_key.offset++;
- else if (min_key.type < (u8)-1)
- min_key.type++;
- else if (min_key.objectid < (u64)-1)
- min_key.objectid++;
- else
- break;
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ xattrs_logged = true;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
+ goto out_unlock;
}
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
- err = ret;
+log_extents:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ btrfs_release_path(path);
}
- ins_nr = 0;
}
-
-log_extents:
if (fast_search) {
- btrfs_release_path(dst_path);
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
- if (ret) {
- err = ret;
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
+ if (ret)
goto out_unlock;
- }
- } else {
- struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ } else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
- write_lock(&tree->lock);
- list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+ write_lock(&em_tree->lock);
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
list_del_init(&em->list);
- write_unlock(&tree->lock);
+ write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- btrfs_release_path(path);
- btrfs_release_path(dst_path);
- ret = log_directory_changes(trans, root, inode, path, dst_path);
- if (ret) {
- err = ret;
+ if (full_dir_logging) {
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
goto out_unlock;
- }
}
- BTRFS_I(inode)->logged_trans = trans->transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
-out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
- mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
+ */
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+
+ /*
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
+ */
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
+
+out_unlock:
+ mutex_unlock(&inode->log_mutex);
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return err;
+
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
+
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
+
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+ }
+
+ return ret;
}
-/*
- * follow the dentry parent pointers up the chain and see if any
- * of the directories in it require a full commit before they can
- * be logged. Returns zero if nothing special needs to be done or 1 if
- * a full commit is required.
- */
-static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_log_ctx *ctx)
{
- int ret = 0;
- struct btrfs_root *root;
- struct dentry *old_parent = NULL;
+ int ret;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_key key;
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
- /*
- * for regular files, if its inode is already on disk, we don't
- * have to worry about the parents at all. This is because
- * we can use the last_unlink_trans field to record renames
- * and other fun in this file.
- */
- if (S_ISREG(inode->i_mode) &&
- BTRFS_I(inode)->generation <= last_committed &&
- BTRFS_I(inode)->last_unlink_trans <= last_committed)
- goto out;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = true;
+ path->search_commit_root = true;
- if (!S_ISDIR(inode->i_mode)) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
- goto out;
- inode = parent->d_inode;
- }
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
- while (1) {
- BTRFS_I(inode)->logged_trans = trans->transid;
- smp_mb();
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
- if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
- root = BTRFS_I(inode)->root;
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ break;
+ continue;
+ }
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ u64 dir_id;
+ struct btrfs_inode *dir_inode;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ dir_id = btrfs_inode_extref_parent(leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ dir_id = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget_logging(dir_id, root);
/*
- * make sure any commits to the log are forced
- * to be full commits
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
*/
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
- ret = 1;
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
+
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
+ if (ret)
+ return ret;
+ }
+ path->slots[0]++;
+ }
+ return 0;
+}
+
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ while (true) {
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_key search_key;
+ struct btrfs_inode *inode;
+ u64 ino;
+ int ret = 0;
+
+ btrfs_release_path(path);
+
+ ino = found_key.offset;
+
+ search_key.objectid = found_key.offset;
+ search_key.type = BTRFS_INODE_ITEM_KEY;
+ search_key.offset = 0;
+ inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ return ret;
+
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
break;
+
+ search_key.type = BTRFS_INODE_REF_KEY;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
}
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != search_key.objectid ||
+ found_key.type != BTRFS_INODE_REF_KEY)
+ return -ENOENT;
+ }
+ return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct dentry *old_parent = NULL;
+ struct super_block *sb = inode->vfs_inode.i_sb;
+ int ret = 0;
+
+ while (true) {
+ if (!parent || d_really_is_negative(parent) ||
+ sb != parent->d_sb)
break;
+ inode = BTRFS_I(d_inode(parent));
+ if (root != inode->root)
+ break;
+
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
+ ret = btrfs_log_inode(trans, inode,
+ LOG_INODE_EXISTS, ctx);
+ if (ret)
+ break;
+ }
if (IS_ROOT(parent))
break;
parent = dget_parent(parent);
dput(old_parent);
old_parent = parent;
- inode = parent->d_inode;
-
}
dput(old_parent);
-out:
+
return ret;
}
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_key search_key;
+ int ret;
+
+ /*
+ * For a single hard link case, go through a fast path that does not
+ * need to iterate the fs/subvolume tree.
+ */
+ if (inode->vfs_inode.i_nlink < 2)
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ path->slots[0]++;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key found_key;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != ino ||
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ /*
+ * Don't deal with extended references because they are rare
+ * cases and too complex to deal with (we would need to keep
+ * track of which subitem we are processing for each item in
+ * this loop, etc). So just return some error to fallback to
+ * a transaction commit.
+ */
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
+
+ /*
+ * Logging ancestors needs to do more searches on the fs/subvol
+ * tree, so it releases the path as needed to avoid deadlocks.
+ * Keep track of the last inode ref key and resume from that key
+ * after logging all new ancestors for the current hard link.
+ */
+ memcpy(&search_key, &found_key, sizeof(search_key));
+
+ ret = log_new_ancestors(trans, root, path, ctx);
+ if (ret)
+ return ret;
+ btrfs_release_path(path);
+ goto again;
+ }
+ return 0;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -3858,49 +7449,39 @@ out:
* the last committed transaction
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ int inode_only,
+ struct btrfs_log_ctx *ctx)
{
- int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
- struct super_block *sb;
- struct dentry *old_parent = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
- u64 last_committed = root->fs_info->last_trans_committed;
+ bool log_dentries;
- sb = inode->i_sb;
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_test_opt(root, NOTREELOG)) {
- ret = 1;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
- if (root->fs_info->last_trans_log_full_commit >
- root->fs_info->last_trans_committed) {
- ret = 1;
- goto end_no_trans;
- }
-
- if (root != BTRFS_I(inode)->root ||
- btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
- goto end_no_trans;
- }
-
- ret = check_parent_dirs_for_sync(trans, inode, parent,
- sb, last_committed);
- if (ret)
- goto end_no_trans;
+ /*
+ * If we're logging an inode from a subvolume created in the current
+ * transaction we must force a commit since the root is not persisted.
+ */
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_inode_in_log(inode, trans->transid)) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ /* Skip already logged inodes and without new extents. */
+ if (btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents))
+ return BTRFS_NO_LOG_SYNC;
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ return ret;
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -3910,44 +7491,82 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* we can use the last_unlink_trans field to record renames
* and other fun in this file.
*/
- if (S_ISREG(inode->i_mode) &&
- BTRFS_I(inode)->generation <= last_committed &&
- BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+ if (S_ISREG(inode->vfs_inode.i_mode) &&
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
- inode_only = LOG_INODE_EXISTS;
- while (1) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
- break;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
- inode = parent->d_inode;
- if (root != BTRFS_I(inode)->root)
- break;
+ /*
+ * On unlink we must make sure all our current and old parent directory
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar as the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the inode file with name foo3
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (inode->last_unlink_trans >= trans->transid) {
+ ret = btrfs_log_all_parents(trans, inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
- if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode, inode_only);
- if (ret)
- goto end_trans;
- }
- if (IS_ROOT(parent))
- break;
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
+ if (ret)
+ goto end_trans;
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- }
- ret = 0;
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, inode, ctx);
end_trans:
- dput(old_parent);
if (ret < 0) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
- ret = 1;
+ btrfs_set_log_full_commit(trans);
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -3958,12 +7577,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -3979,20 +7600,17 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_key tmp_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
- .stage = 0,
+ .stage = LOG_WALK_PIN_ONLY,
};
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- fs_info->log_root_recovering = 1;
+ set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans)) {
@@ -4001,26 +7619,28 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
- "recovering log root tree.");
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto error;
}
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
while (1) {
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -4034,55 +7654,98 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
+ btrfs_abort_transaction(trans, ret);
goto error;
}
- tmp_key.objectid = found_key.offset;
- tmp_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_key.offset = (u64)-1;
-
- wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
- "for tree log recovery.");
- goto error;
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
}
- wc.replay_dest->log_root = log;
- btrfs_record_root_in_trans(trans, wc.replay_dest);
- ret = walk_log_tree(trans, log, &wc);
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
}
- key.offset = found_key.offset - 1;
- wc.replay_dest->log_root = NULL;
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.root;
+
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
+ /*
+ * We have just replayed everything, and the highest
+ * objectid of fs roots probably has changed in case
+ * some inode_item's got replayed.
+ *
+ * root->objectid_mutex is not acquired as log replay
+ * could only happen during mount.
+ */
+ ret = btrfs_init_root_free_objectid(root);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
+ }
+next:
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
+ }
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
-
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -4096,19 +7759,18 @@ again:
btrfs_free_path(path);
/* step 4: commit the transaction, which also unpins the blocks */
- ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
- free_extent_buffer(log_root_tree->node);
- log_root_tree->log_root = NULL;
- fs_info->log_root_recovering = 0;
- kfree(log_root_tree);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
return 0;
error:
if (wc.trans)
- btrfs_end_transaction(wc.trans, fs_info->tree_root);
+ btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -4120,10 +7782,13 @@ error:
* They revolve around files there were unlinked from the directory, and
* this function updates the parent directory so that a full commit is
* properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
- struct inode *dir, struct inode *inode,
- int for_rename)
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ bool for_rename)
{
/*
* when we're logging a file, if it hasn't been renamed
@@ -4135,22 +7800,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode))
- BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_lock(&inode->log_mutex);
+ inode->last_unlink_trans = trans->transid;
+ mutex_unlock(&inode->log_mutex);
+
+ if (!for_rename)
+ return;
/*
- * if this directory was already logged any new
- * names for this file/dir will get recorded
+ * If this directory was already logged, any new names will be logged
+ * with btrfs_log_new_name() and old names will be deleted from the log
+ * tree with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- smp_mb();
- if (BTRFS_I(dir)->logged_trans == trans->transid)
+ if (inode_logged(trans, dir, NULL) == 1)
return;
/*
- * if the inode we're about to unlink was logged,
- * the log will be properly updated for any new names
+ * If the inode we're about to unlink was logged before, the log will be
+ * properly updated with the new name with btrfs_log_new_name() and the
+ * old name removed with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- if (BTRFS_I(inode)->logged_trans == trans->transid)
+ if (inode_logged(trans, inode, NULL) == 1)
return;
/*
@@ -4160,46 +7832,219 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* properly. So, we have to be conservative and force commits
* so the new name gets discovered.
*/
- if (for_rename)
- goto record;
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
- /* we can safely do the unlink without any special recording */
- return;
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and tree of tree roots trees, etc) are done.
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
-record:
- BTRFS_I(dir)->last_unlink_trans = trans->transid;
+/*
+ * Call this when creating a subvolume in a directory.
+ * Because we don't commit a transaction when creating a subvolume, we can't
+ * allow the directory pointing to the subvolume to be logged with an entry that
+ * points to an unpersisted root if we are still in the transaction used to
+ * create the subvolume, so make any attempt to log the directory to result in a
+ * full log sync.
+ * Also we don't need to worry with renames, since btrfs_rename() marks the log
+ * for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
+ */
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
}
/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+ * Update the log after adding a new name for an inode.
+ *
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
*
- * It will return zero if all goes well, and it will return 1 if a
- * full transaction commit is required.
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *old_dir,
- struct dentry *parent)
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
- struct btrfs_root * root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
+ int ret;
+
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
/*
* this will force the logging code to walk the dentry chain
* up for the file
*/
- if (S_ISREG(inode->i_mode))
- BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ if (!S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_unlink_trans = trans->transid;
/*
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (BTRFS_I(inode)->logged_trans <=
- root->fs_info->last_trans_committed &&
- (!old_dir || BTRFS_I(old_dir)->logged_trans <=
- root->fs_info->last_trans_committed))
- return 0;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
+
+ /*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
+ */
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+ struct fscrypt_name fname;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
+ "old_dir_index=%llu", old_dir_index);
+
+ ret = fscrypt_setup_filename(&old_dir->vfs_inode,
+ &old_dentry->d_name, 0, &fname);
+ if (ret)
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does, it's not serious, just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ log_pinned = true;
+
+ /*
+ * Other concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in progress directory logging.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ &fname.disk_name, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}