From b52a360b2aa1c59ba9970fb0f52bbb093fcc7a24 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 7 Nov 2011 16:10:24 +0000 Subject: xfs: Fix possible memory corruption in xfs_readlink Fixes a possible memory corruption when the link is larger than MAXPATHLEN and XFS_DEBUG is not enabled. This also remove the S_ISLNK assert, since the inode mode is checked previously in xfs_readlink_by_handle() and via VFS. Updated to address concerns raised by Ben Hutchings about the loose attention paid to 32- vs 64-bit values, and the lack of handling a potentially negative pathlen value: - Changed type of "pathlen" to be xfs_fsize_t, to match that of ip->i_d.di_size - Added checking for a negative pathlen to the too-long pathlen test, and generalized the message that gets reported in that case to reflect the change As a result, if a negative pathlen were encountered, this function would return EFSCORRUPTED (and would fail an assertion for a debug build)--just as would a too-long pathlen. Signed-off-by: Alex Elder Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a549060..ce9268a2f56b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -112,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -122,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; -- cgit From 272e42b215c52d32e06bf035c1f6b70baa6716bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 09:54:24 +0000 Subject: xfs: constify xfs_item_ops The log item ops aren't nessecarily the biggest exploit vector, but marking them const is easy enough. Also remove the unused xfs_item_ops_t typedef while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_trans.h | 6 +++--- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a3513881bce..eac97ef81e2a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d236d2..0dee0b71029d 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e62623437..35c2aff38b20 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21ba240f..abaafdbb3e65 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -795,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a6277c52..a14cd89fe465 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -626,7 +626,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994af..3f7bf451c034 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb52041..3ae713c0abd9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +352,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -- cgit From 810627d9a6d0e8820c798001875bc4e1b7754ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2011 08:56:15 +0000 Subject: xfs: fix force shutdown handling in xfs_end_io Ensure ioend->io_error gets propagated back to e.g. AIO completions. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b13310ee0c..574d4ee9b625 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_end_io( int error = 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = -EIO; + ioend->io_error = -EIO; goto done; } if (ioend->io_error) -- cgit From db3e74b582915d66e10b0c73a62763418f54c340 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Thu, 10 Nov 2011 01:33:10 +0000 Subject: xfs: use doalloc flag in xfs_qm_dqattach_one() The doalloc arg in xfs_qm_dqattach_one() is a flag that indicates whether a new area to handle quota information will be allocated if needed. Originally, it was passed to xfs_qm_dqget(), but has been removed by the following commit (probably by mistake): commit 8e9b6e7fa4544ea8a0e030c8987b918509c8ff47 Author: Christoph Hellwig Date: Sun Feb 8 21:51:42 2009 +0100 xfs: remove the unused XFS_QMOPT_DQLOCK flag As the result, xfs_qm_dqget() called from xfs_qm_dqattach_one() never allocates the new area even if it is needed. This patch gives the doalloc arg to xfs_qm_dqget() in xfs_qm_dqattach_one() to fix this problem. Signed-off-by: Mitsuo Hayasaka Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5cff443f6cdb..0bbb1a41998b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -674,7 +674,8 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); if (error) return error; -- cgit From fa8b18edd752a8b4e9d1ee2cd615b82c93cf8bba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 20 Nov 2011 15:35:32 +0000 Subject: xfs: validate acl count This prevents in-memory corruption and possible panics if the on-disk ACL is badly corrupted. Signed-off-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_acl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6c4b3795c4a..76e4266d2e7e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp) int count, i; count = be32_to_cpu(aclp->acl_cnt); + if (count > XFS_ACL_MAX_ENTRIES) + return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) -- cgit From 4dd2cb4a28b7ab1f37163a4eba280926a13a8749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Nov 2011 12:06:14 -0600 Subject: xfs: force buffer writeback before blocking on the ilock in inode reclaim If we are doing synchronous inode reclaim we block the VM from making progress in memory reclaim. So if we encouter a flush locked inode promote it in the delwri list and wake up xfsbufd to write it out now. Without this we can get hangs of up to 30 seconds during workloads hitting synchronous inode reclaim. The scheme is copied from what we do for dquot reclaims. Reported-by: Simon Kirby Signed-off-by: Christoph Hellwig Tested-by: Simon Kirby Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 21 +++++++++++++++++++++ fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_sync.c | 11 +++++++++++ 3 files changed, 33 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0237c602f11..755ee8164880 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2835,6 +2835,27 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } +void +xfs_promote_inode( + struct xfs_inode *ip) +{ + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, + ip->i_imap.im_len, XBF_TRYLOCK); + if (!bp) + return; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_buf_delwri_promote(bp); + wake_up_process(ip->i_mount->m_ddev_targp->bt_task); + } + + xfs_buf_relse(bp); +} + /* * Return a pointer to the extent record at file index idx. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 760140d1dd66..b4cd4739f98e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); +void xfs_promote_inode(struct xfs_inode *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index aa3dc1a4d53d..be5c51d8f757 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -770,6 +770,17 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; + + /* + * If we only have a single dirty inode in a cluster there is + * a fair chance that the AIL push may have pushed it into + * the buffer, but xfsbufd won't touch it until 30 seconds + * from now, and thus we will lock up here. + * + * Promote the inode buffer to the front of the delwri list + * and wake up xfsbufd now. + */ + xfs_promote_inode(ip); xfs_iflock(ip); } -- cgit From 4c393a6059f8442a70512a48ce4639b882b6f6ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Nov 2011 17:44:30 +0000 Subject: xfs: fix attr2 vs large data fork assert With Dmitry fsstress updates I've seen very reproducible crashes in xfs_attr_shortform_remove because xfs_attr_shortform_bytesfit claims that the attributes would not fit inline into the inode after removing an attribute. It turns out that we were operating on an inode with lots of delalloc extents, and thus an if_bytes values for the data fork that is larger than biggest possible on-disk storage for it which utterly confuses the code near the end of xfs_attr_shortform_bytesfit. Fix this by always allowing the current attribute fork, like we already do for the attr1 format, given that delalloc conversion will take care for moving either the data or attribute area out of line if it doesn't fit at that point - or making the point moot by merging extents at this point. Also document the function better, and clean up some loose bits. Reviewed-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 64 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d4906e7c9787..c1b55e596551 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) /* * Query whether the requested number of additional bytes of extended * attribute space will be able to fit inline. + * * Returns zero if not, else the di_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * @@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ - int dsize; + int dsize; xfs_mount_t *mp = dp->i_mount; offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ @@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? minforkoff : 0; } - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { - if (bytes <= XFS_IFORK_ASIZE(dp)) - return dp->i_d.di_forkoff; + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) return 0; - } dsize = dp->i_df.if_bytes; - + switch (dp->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* + /* * If there is no attr fork and the data fork is extents, - * determine if creating the default attr fork will result - * in the extents form migrating to btree. If so, the - * minimum offset only needs to be the space required for + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for * the btree root. - */ + */ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; - case XFS_DINODE_FMT_BTREE: /* - * If have data btree then keep forkoff if we have one, - * otherwise we are adding a new attr, so then we set - * minforkoff to where the btree root can finish so we have + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ if (dp->i_d.di_forkoff) { - if (offset < dp->i_d.di_forkoff) + if (offset < dp->i_d.di_forkoff) return 0; - else - return dp->i_d.di_forkoff; - } else - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); break; } - - /* - * A data fork btree root must have space for at least + + /* + * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); @@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ - if (offset >= minforkoff && offset < maxforkoff) - return offset; if (offset >= maxforkoff) return maxforkoff; + if (offset >= minforkoff) + return offset; return 0; } -- cgit