summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--fs/xfs/xfs_inode.c570
1 files changed, 125 insertions, 445 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 33cf0b82a0c0..3022918bf96a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -1800,195 +1801,69 @@ out:
* because we must walk that list to find the inode that points to the inode
* being removed from the unlinked hash bucket list.
*
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk. That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
*
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk. The only errors that
- * should bubble out are for obviously incorrect situations.
- *
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
*/
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
- struct rhash_head iu_rhash_head;
- xfs_agino_t iu_agino; /* X */
- xfs_agino_t iu_next_unlinked; /* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
- struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const xfs_agino_t *key = arg->key;
- const struct xfs_iunlink *iu = obj;
-
- if (iu->iu_next_unlinked != *key)
- return 1;
- return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
- .min_size = XFS_AGI_UNLINKED_BUCKETS,
- .key_len = sizeof(xfs_agino_t),
- .key_offset = offsetof(struct xfs_iunlink,
- iu_next_unlinked),
- .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
- .automatic_shrinking = true,
- .obj_cmpfn = xfs_iunlink_obj_cmpfn,
-};
-
/*
- * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
- * relation is found.
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode as we have existence guarantees by holding the AGI buffer lock and that
+ * only unlinked, referenced inodes can be on the unlinked inode list. If we
+ * don't find the inode in cache, then let the caller handle the situation.
*/
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
+static struct xfs_inode *
+xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
{
- struct xfs_iunlink *iu;
-
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- return iu ? iu->iu_agino : NULLAGINO;
-}
+ struct xfs_inode *ip;
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
- struct xfs_perag *pag,
- struct xfs_iunlink *iu)
-{
- int error;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
- error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
/*
- * Fail loudly if there already was an entry because that's a sign of
- * corruption of in-memory data. Also fail loudly if we see an error
- * code we didn't anticipate from the rhashtable code. Currently we
- * only anticipate ENOMEM.
+ * Inode not in memory or in RCU freeing limbo should not happen.
+ * Warn about this and let the caller handle the failure.
*/
- if (error) {
- WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
- kmem_free(iu);
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
}
- /*
- * Absorb any runtime errors that aren't a result of corruption because
- * this is a cache and we can always fall back to bucket list scanning.
- */
- if (error != 0 && error != -EEXIST)
- error = 0;
- return error;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+ rcu_read_unlock();
+ return ip;
}
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
+/* Update the prev pointer of the next agino. */
static int
-xfs_iunlink_add_backref(
+xfs_iunlink_update_backref(
struct xfs_perag *pag,
xfs_agino_t prev_agino,
- xfs_agino_t this_agino)
-{
- struct xfs_iunlink *iu;
-
- if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
- return 0;
-
- iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
- iu->iu_agino = prev_agino;
- iu->iu_next_unlinked = this_agino;
-
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
- struct xfs_perag *pag,
- xfs_agino_t agino,
- xfs_agino_t next_unlinked)
+ xfs_agino_t next_agino)
{
- struct xfs_iunlink *iu;
- int error;
-
- /* Look up the old entry; if there wasn't one then exit. */
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- if (!iu)
- return 0;
-
- /*
- * Remove the entry. This shouldn't ever return an error, but if we
- * couldn't remove the old entry we don't want to add it again to the
- * hash table, and if the entry disappeared on us then someone's
- * violated the locking rules and we need to fail loudly. Either way
- * we cannot remove the inode because internal state is or would have
- * been corrupt.
- */
- error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
- if (error)
- return error;
+ struct xfs_inode *ip;
- /* If there is no new next entry just free our item and return. */
- if (next_unlinked == NULLAGINO) {
- kmem_free(iu);
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
return 0;
- }
-
- /* Update the entry and re-add it to the hash table. */
- iu->iu_next_unlinked = next_unlinked;
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
- struct xfs_perag *pag)
-{
- return rhashtable_init(&pag->pagi_unlinked_hash,
- &xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
- void *ptr,
- void *arg)
-{
- struct xfs_iunlink *iu = ptr;
- bool *freed_anything = arg;
- *freed_anything = true;
- kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
- struct xfs_perag *pag)
-{
- bool freed_anything = false;
-
- rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
- xfs_iunlink_free_item, &freed_anything);
-
- ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -EFSCORRUPTED;
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
}
/*
@@ -2030,88 +1905,53 @@ xfs_iunlink_update_bucket(
return 0;
}
-/* Set an on-disk inode's next_unlinked pointer. */
-STATIC void
-xfs_iunlink_update_dinode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t agino,
- struct xfs_buf *ibp,
- struct xfs_dinode *dip,
- struct xfs_imap *imap,
- xfs_agino_t next_agino)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(pag, next_agino));
-
- trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
- be32_to_cpu(dip->di_next_unlinked), next_agino);
-
- dip->di_next_unlinked = cpu_to_be32(next_agino);
- offset = imap->im_boffset +
- offsetof(struct xfs_dinode, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-}
-
-/* Set an in-core inode's unlinked pointer and return the old value. */
-STATIC int
-xfs_iunlink_update_inode(
+static int
+xfs_iunlink_insert_inode(
struct xfs_trans *tp,
- struct xfs_inode *ip,
struct xfs_perag *pag,
- xfs_agino_t next_agino,
- xfs_agino_t *old_next_agino)
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_dinode *dip;
- struct xfs_buf *ibp;
- xfs_agino_t old_value;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
- ASSERT(xfs_verify_agino_or_null(pag, next_agino));
-
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
- if (error)
- return error;
- dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
-
- /* Make sure the old pointer isn't garbage. */
- old_value = be32_to_cpu(dip->di_next_unlinked);
- if (!xfs_verify_agino_or_null(pag, old_value)) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
- sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- goto out;
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
+ return -EFSCORRUPTED;
}
/*
- * Since we're updating a linked list, we should never find that the
- * current pointer is the same as the new value, unless we're
- * terminating the list.
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
*/
- *old_next_agino = old_value;
- if (old_value == next_agino) {
- if (next_agino != NULLAGINO) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
- dip, sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- }
- goto out;
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error)
+ return error;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ ip->i_next_unlinked = next_agino;
}
- /* Ok, update the new pointer. */
- xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
- ibp, dip, &ip->i_imap, next_agino);
- return 0;
-out:
- xfs_trans_brelse(tp, ibp);
- return error;
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
/*
@@ -2128,11 +1968,7 @@ xfs_iunlink(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
- struct xfs_agi *agi;
struct xfs_buf *agibp;
- xfs_agino_t next_agino;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2145,193 +1981,29 @@ xfs_iunlink(
error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out;
- agi = agibp->b_addr;
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the pointer isn't garbage and that this inode
- * isn't already on the list.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (next_agino == agino ||
- !xfs_verify_agino_or_null(pag, next_agino)) {
- xfs_buf_mark_corrupt(agibp);
- error = -EFSCORRUPTED;
- goto out;
- }
-
- if (next_agino != NULLAGINO) {
- xfs_agino_t old_agino;
-
- /*
- * There is already another inode in the bucket, so point this
- * inode to the current head of the list.
- */
- error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
- &old_agino);
- if (error)
- goto out;
- ASSERT(old_agino == NULLAGINO);
-
- /*
- * agino has been unlinked, add a backref from the next inode
- * back to agino.
- */
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- if (error)
- goto out;
- }
-
- /* Point the head of the list to point to this inode. */
- error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
out:
xfs_perag_put(pag);
return error;
}
-/* Return the imap, dinode pointer, and buffer for an inode. */
-STATIC int
-xfs_iunlink_map_ino(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int error;
-
- imap->im_blkno = 0;
- error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
-
- error = xfs_imap_to_bp(mp, tp, imap, bpp);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
-
- *dipp = xfs_buf_offset(*bpp, imap->im_boffset);
- return 0;
-}
-
-/*
- * Walk the unlinked chain from @head_agino until we find the inode that
- * points to @target_agino. Return the inode number, map, dinode pointer,
- * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
- *
- * @tp, @pag, @head_agino, and @target_agino are input parameters.
- * @agino, @imap, @dipp, and @bpp are all output parameters.
- *
- * Do not call this function if @target_agino is the head of the list.
- */
-STATIC int
-xfs_iunlink_map_prev(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t head_agino,
- xfs_agino_t target_agino,
- xfs_agino_t *agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agino_t next_agino;
- int error;
-
- ASSERT(head_agino != target_agino);
- *bpp = NULL;
-
- /* See if our backref cache can find it faster. */
- *agino = xfs_iunlink_lookup_backref(pag, target_agino);
- if (*agino != NULLAGINO) {
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
- return 0;
-
- /*
- * If we get here the cache contents were corrupt, so drop the
- * buffer and fall back to walking the bucket list.
- */
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- WARN_ON_ONCE(1);
- }
-
- trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
-
- /* Otherwise, walk the entire bucket until we find it. */
- next_agino = head_agino;
- while (next_agino != target_agino) {
- xfs_agino_t unlinked_agino;
-
- if (*bpp)
- xfs_trans_brelse(tp, *bpp);
-
- *agino = next_agino;
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
- /*
- * Make sure this pointer is valid and isn't an obvious
- * infinite loop.
- */
- if (!xfs_verify_agino(pag, unlinked_agino) ||
- next_agino == unlinked_agino) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- *dipp, sizeof(**dipp));
- error = -EFSCORRUPTED;
- return error;
- }
- next_agino = unlinked_agino;
- }
-
- return 0;
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-STATIC int
-xfs_iunlink_remove(
+static int
+xfs_iunlink_remove_inode(
struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_buf *agibp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi;
- struct xfs_buf *agibp;
- struct xfs_buf *last_ibp;
- struct xfs_dinode *last_dip = NULL;
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t next_agino;
xfs_agino_t head_agino;
short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
trace_xfs_iunlink_remove(ip);
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, &agibp);
- if (error)
- return error;
- agi = agibp->b_addr;
-
/*
* Get the index into the agi hash table for the list this inode will
* go on. Make sure the head pointer isn't garbage.
@@ -2348,52 +2020,60 @@ xfs_iunlink_remove(
* the old pointer value so that we can update whatever was previous
* to us in the list to point to whatever was next in the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
if (error)
return error;
/*
- * If there was a backref pointing from the next inode back to this
- * one, remove it because we've removed this inode from the list.
- *
- * Later, if this inode was in the middle of the list we'll update
- * this inode's backref to point from the next inode.
+ * Update the prev pointer in the next inode to point back to previous
+ * inode in the chain.
*/
- if (next_agino != NULLAGINO) {
- error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
- if (error)
- return error;
- }
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
if (head_agino != agino) {
- struct xfs_imap imap;
- xfs_agino_t prev_agino;
-
- /* We need to search the list for the inode being freed. */
- error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
- &prev_agino, &imap, &last_dip, &last_ibp);
- if (error)
- return error;
+ struct xfs_inode *prev_ip;
- /* Point the previous inode on the list to the next inode. */
- xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
- last_dip, &imap, next_agino);
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip)
+ return -EFSCORRUPTED;
- /*
- * Now we deal with the backref for this inode. If this inode
- * pointed at a real inode, change the backref that pointed to
- * us to point to our old next. If this inode was the end of
- * the list, delete the backref that pointed to us. Note that
- * change_backref takes care of deleting the backref if
- * next_agino is NULLAGINO.
- */
- return xfs_iunlink_change_backref(agibp->b_pag, agino,
- next_agino);
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
}
- /* Point the head of the list to the next unlinked inode. */
- return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
- next_agino);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+ return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *agibp;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
}
/*