summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_inode_item.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_inode_item.c')
-rw-r--r--fs/xfs/xfs_inode_item.c212
1 files changed, 145 insertions, 67 deletions
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91c847a84e10..2eb0c6011a2e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
#include <linux/iversion.h>
@@ -36,6 +37,36 @@ xfs_inode_item_sort(
return INODE_ITEM(lip)->ili_inode->i_ino;
}
+#ifdef DEBUG_EXPENSIVE
+static void
+xfs_inode_item_precommit_check(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dinode *dip;
+ xfs_failaddr_t fa;
+
+ dip = kzalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | GFP_NOFS);
+ if (!dip) {
+ ASSERT(dip != NULL);
+ return;
+ }
+
+ xfs_inode_to_disk(ip, dip, 0);
+ xfs_dinode_calc_crc(mp, dip);
+ fa = xfs_dinode_verify(mp, ip->i_ino, dip);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), fa);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+ kfree(dip);
+}
+#else
+# define xfs_inode_item_precommit_check(ip) ((void)0)
+#endif
+
/*
* Prior to finally logging the inode, we have to ensure that all the
* per-modification inode state changes are applied. This includes VFS inode
@@ -82,9 +113,9 @@ xfs_inode_item_precommit(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
}
@@ -100,32 +131,28 @@ xfs_inode_item_precommit(
}
/*
- * Inode verifiers do not check that the extent size hint is an integer
- * multiple of the rt extent size on a directory with both rtinherit
- * and extszinherit flags set. If we're logging a directory that is
- * misconfigured in this way, clear the hint.
+ * Inode verifiers do not check that the extent size hints are an
+ * integer multiple of the rt extent size on a directory with
+ * rtinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the bad hints.
*/
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- flags |= XFS_ILOG_CORE;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) {
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+ if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
}
- /*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
- * (ili_fields) correctly tracks that the version has changed.
- */
spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
- if (flags & XFS_ILOG_IVERSION)
- flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
-
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -154,12 +181,26 @@ xfs_inode_item_precommit(
xfs_buf_hold(bp);
spin_lock(&iip->ili_lock);
iip->ili_item.li_buf = bp;
- bp->b_flags |= _XBF_INODES;
+ bp->b_iodone = xfs_buf_inode_iodone;
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
xfs_trans_brelse(tp, bp);
}
/*
+ * Store the dirty flags back into the inode item as this state is used
+ * later on in xfs_inode_item_committing() to determine whether the
+ * transaction is relevant to fsync state or not.
+ */
+ iip->ili_dirty_flags = flags;
+
+ /*
+ * Convert the flags on-disk fields that have been modified in the
+ * transaction so that ili_fields tracks the changes correctly.
+ */
+ if (flags & XFS_ILOG_IVERSION)
+ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+ /*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
@@ -168,11 +209,7 @@ xfs_inode_item_precommit(
iip->ili_fields |= (flags | iip->ili_last_fields);
spin_unlock(&iip->ili_lock);
- /*
- * We are done with the log item transaction dirty state, so clear it so
- * that it doesn't pollute future transactions.
- */
- iip->ili_dirty_flags = 0;
+ xfs_inode_item_precommit_check(ip);
return 0;
}
@@ -209,6 +246,7 @@ xfs_inode_item_data_fork_size(
}
break;
case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
*nbytes += ip->i_df.if_broot_bytes;
@@ -329,6 +367,7 @@ xfs_inode_item_format_data_fork(
}
break;
case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
@@ -351,11 +390,10 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_df.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data,
- ip->i_df.if_bytes);
+ ip->i_df.if_data, ip->i_df.if_bytes);
ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
@@ -430,10 +468,9 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_af.if_bytes > 0) {
- ASSERT(ip->i_af.if_u1.if_data != NULL);
+ ASSERT(ip->i_af.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_af.if_u1.if_data,
- ip->i_af.if_bytes);
+ ip->i_af.if_data, ip->i_af.if_bytes);
ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
@@ -525,10 +562,9 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
@@ -550,16 +586,26 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_v3_pad = 0;
+
+ /* dummy value for initialisation */
+ to->di_crc = 0;
+
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = ip->i_metatype;
+ else
+ to->di_metatype = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ to->di_metatype = 0;
}
xfs_inode_to_log_dinode_iext_counters(ip, to);
@@ -648,7 +694,7 @@ xfs_inode_item_pin(
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
@@ -673,13 +719,24 @@ xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
- struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
- if (atomic_dec_and_test(&ip->i_pincount))
+
+ /*
+ * If this is the last unpin, then the inode no longer needs a journal
+ * flush to persist it. Hence we can clear the commit sequence numbers
+ * as a fsync/fdatasync operation on the inode at this point is a no-op.
+ */
+ if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+ iip->ili_commit_seq = 0;
+ iip->ili_datasync_seq = 0;
+ spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
}
STATIC uint
@@ -702,11 +759,14 @@ xfs_inode_item_push(
* completed and items removed from the AIL before the next push
* attempt.
*/
+ trace_xfs_inode_push_stale(ip, _RET_IP_);
return XFS_ITEM_PINNED;
}
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) {
+ trace_xfs_inode_push_pinned(ip, _RET_IP_);
return XFS_ITEM_PINNED;
+ }
if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
@@ -754,7 +814,7 @@ xfs_inode_item_release(
unsigned short lock_flags;
ASSERT(ip->i_itemp != NULL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
@@ -799,12 +859,45 @@ xfs_inode_item_committed(
return lsn;
}
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then
+ * check if the changes will impact a datasync (O_DSYNC) journal flush. If the
+ * changes will require a datasync flush, then we also record the sequence in
+ * ili_datasync_seq.
+ *
+ * These commit sequence numbers will get cleared atomically with the inode being
+ * unpinned (i.e. pin count goes to zero), and so it will only be set when the
+ * inode is dirty in the journal. This removes the need for checking if the
+ * inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ *
+ */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_commit_seq = seq;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ spin_lock(&iip->ili_lock);
+ iip->ili_commit_seq = seq;
+ if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+ iip->ili_datasync_seq = seq;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Clear the per-transaction dirty flags now that we have finished
+ * recording the transaction's inode modifications in the CIL and are
+ * about to release and (maybe) unlock the inode.
+ */
+ iip->ili_dirty_flags = 0;
+
return xfs_inode_item_release(lip);
}
@@ -854,7 +947,7 @@ xfs_inode_item_destroy(
ASSERT(iip->ili_item.li_buf == NULL);
ip->i_itemp = NULL;
- kmem_free(iip->ili_item.li_lv_shadow);
+ kvfree(iip->ili_item.li_lv_shadow);
kmem_cache_free(xfs_ili_cache, iip);
}
@@ -931,6 +1024,7 @@ xfs_iflush_finish(
}
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
spin_unlock(&iip->ili_lock);
xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
@@ -983,16 +1077,6 @@ xfs_buf_inode_iodone(
list_splice_tail(&flushed_inodes, &bp->b_li_list);
}
-void
-xfs_buf_inode_io_fail(
- struct xfs_buf *bp)
-{
- struct xfs_log_item *lip;
-
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
- set_bit(XFS_LI_FAILED, &lip->li_flags);
-}
-
/*
* Clear the inode logging fields so no more flushes are attempted. If we are
* on a buffer list, it is now safe to remove it because the buffer is
@@ -1005,10 +1089,10 @@ xfs_iflush_abort_clean(
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
+ clear_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
}
/*
@@ -1041,13 +1125,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
- *
- * We also clear the failed bit before removing the item from the AIL
- * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
- * references the inode item owns and needs to hold until we've fully
- * aborted the inode log item and detached it from the buffer.
*/
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
@@ -1137,12 +1215,12 @@ xfs_iflush_shutdown_abort(
*/
int
xfs_inode_item_format_convert(
- struct xfs_log_iovec *buf,
+ struct kvec *buf,
struct xfs_inode_log_format *in_f)
{
- struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+ struct xfs_inode_log_format_32 *in_f32 = buf->iov_base;
- if (buf->i_len != sizeof(*in_f32)) {
+ if (buf->iov_len != sizeof(*in_f32)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}