diff options
Diffstat (limited to 'fs/xfs/xfs_inode_item.c')
| -rw-r--r-- | fs/xfs/xfs_inode_item.c | 212 |
1 files changed, 145 insertions, 67 deletions
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91c847a84e10..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" #include <linux/iversion.h> @@ -36,6 +37,36 @@ xfs_inode_item_sort( return INODE_ITEM(lip)->ili_inode->i_ino; } +#ifdef DEBUG_EXPENSIVE +static void +xfs_inode_item_precommit_check( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dinode *dip; + xfs_failaddr_t fa; + + dip = kzalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | GFP_NOFS); + if (!dip) { + ASSERT(dip != NULL); + return; + } + + xfs_inode_to_disk(ip, dip, 0); + xfs_dinode_calc_crc(mp, dip); + fa = xfs_dinode_verify(mp, ip->i_ino, dip); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), fa); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + ASSERT(fa == NULL); + } + kfree(dip); +} +#else +# define xfs_inode_item_precommit_check(ip) ((void)0) +#endif + /* * Prior to finally logging the inode, we have to ensure that all the * per-modification inode state changes are applied. This includes VFS inode @@ -82,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } @@ -100,32 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -154,12 +181,26 @@ xfs_inode_item_precommit( xfs_buf_hold(bp); spin_lock(&iip->ili_lock); iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); xfs_trans_brelse(tp, bp); } /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines * in the eventual clearing of the ili_fields bits. See the big comment @@ -168,11 +209,7 @@ xfs_inode_item_precommit( iip->ili_fields |= (flags | iip->ili_last_fields); spin_unlock(&iip->ili_lock); - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; + xfs_inode_item_precommit_check(ip); return 0; } @@ -209,6 +246,7 @@ xfs_inode_item_data_fork_size( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { *nbytes += ip->i_df.if_broot_bytes; @@ -329,6 +367,7 @@ xfs_inode_item_format_data_fork( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); @@ -351,11 +390,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -430,10 +468,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { @@ -525,10 +562,9 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; @@ -550,16 +586,26 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; + + if (xfs_is_metadir_inode(ip)) + to->di_metatype = ip->i_metatype; + else + to->di_metatype = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); + to->di_metatype = 0; } xfs_inode_to_log_dinode_iext_counters(ip, to); @@ -648,7 +694,7 @@ xfs_inode_item_pin( { struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); @@ -673,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. + */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -702,11 +759,14 @@ xfs_inode_item_push( * completed and items removed from the AIL before the next push * attempt. */ + trace_xfs_inode_push_stale(ip, _RET_IP_); return XFS_ITEM_PINNED; } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) { + trace_xfs_inode_push_pinned(ip, _RET_IP_); return XFS_ITEM_PINNED; + } if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; @@ -754,7 +814,7 @@ xfs_inode_item_release( unsigned short lock_flags; ASSERT(ip->i_itemp != NULL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; @@ -799,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -854,7 +947,7 @@ xfs_inode_item_destroy( ASSERT(iip->ili_item.li_buf == NULL); ip->i_itemp = NULL; - kmem_free(iip->ili_item.li_lv_shadow); + kvfree(iip->ili_item.li_lv_shadow); kmem_cache_free(xfs_ili_cache, iip); } @@ -931,6 +1024,7 @@ xfs_iflush_finish( } iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); spin_unlock(&iip->ili_lock); xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) @@ -983,16 +1077,6 @@ xfs_buf_inode_iodone( list_splice_tail(&flushed_inodes, &bp->b_li_list); } -void -xfs_buf_inode_io_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - set_bit(XFS_LI_FAILED, &lip->li_flags); -} - /* * Clear the inode logging fields so no more flushes are attempted. If we are * on a buffer list, it is now safe to remove it because the buffer is @@ -1005,10 +1089,10 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); + clear_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); } /* @@ -1041,13 +1125,7 @@ xfs_iflush_abort( * state. Whilst the inode is in the AIL, it should have a valid buffer * pointer for push operations to access - it is only safe to remove the * inode from the buffer once it has been removed from the AIL. - * - * We also clear the failed bit before removing the item from the AIL - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer - * references the inode item owns and needs to hold until we've fully - * aborted the inode log item and detached it from the buffer. */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); /* @@ -1137,12 +1215,12 @@ xfs_iflush_shutdown_abort( */ int xfs_inode_item_format_convert( - struct xfs_log_iovec *buf, + struct kvec *buf, struct xfs_inode_log_format *in_f) { - struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + struct xfs_inode_log_format_32 *in_f32 = buf->iov_base; - if (buf->i_len != sizeof(*in_f32)) { + if (buf->iov_len != sizeof(*in_f32)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } |
