From 6958d11f77d45db80f7e22a21a74d4d5f44dc667 Mon Sep 17 00:00:00 2001
From: Brian Foster
Date: Sun, 17 Mar 2019 15:21:49 -0700
Subject: xfs: don't trip over uninitialized buffer on extent read of corrupted inode

We've had rather rare reports of bmap btree block corruption where the
bmap root block has a level count of zero. The root cause of the
corruption is so far unknown. We do have verifier checks to detect this
form of on-disk corruption, but this doesn't cover a memory corruption
variant of the problem. The latter is a reasonable possibility because
the root block is part of the inode fork and can reside in-core for
some time before inode extents are read.

If this occurs, it leads to a system crash such as the following:

 BUG: unable to handle kernel paging request at ffffffff00000221
 PF error: [normal kernel read fault]
 ...
 RIP: 0010:xfs_trans_brelse+0xf/0x200 [xfs]
 ...
 Call Trace:
  xfs_iread_extents+0x379/0x540 [xfs]
  xfs_file_iomap_begin_delay+0x11a/0xb40 [xfs]
  ? xfs_attr_get+0xd1/0x120 [xfs]
  ? iomap_write_begin.constprop.40+0x2d0/0x2d0
  xfs_file_iomap_begin+0x4c4/0x6d0 [xfs]
  ? __vfs_getxattr+0x53/0x70
  ? iomap_write_begin.constprop.40+0x2d0/0x2d0
  iomap_apply+0x63/0x130
  ? iomap_write_begin.constprop.40+0x2d0/0x2d0
  iomap_file_buffered_write+0x62/0x90
  ? iomap_write_begin.constprop.40+0x2d0/0x2d0
  xfs_file_buffered_aio_write+0xe4/0x3b0 [xfs]
  __vfs_write+0x150/0x1b0
  vfs_write+0xba/0x1c0
  ksys_pwrite64+0x64/0xa0
  do_syscall_64+0x5a/0x1d0
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The crash occurs because xfs_iread_extents() attempts to release an
uninitialized buffer pointer as the level == 0 value prevented the
buffer from ever being allocated or read. Change the level > 0 assert
to an explicit error check in xfs_iread_extents() to avoid crashing the
kernel in the event of localized, in-core inode corruption.

Signed-off-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/libxfs/xfs_bmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 48502cb9990f..ae4c3b0d84db 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1191,7 +1191,10 @@ xfs_iread_extents(
 	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
 	 */
 	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
+	if (unlikely(level == 0)) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
-- cgit
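For reference, a heavily abridged sketch of the failing path described above; this is not the
verbatim xfs_iread_extents() and the block-read call is elided, but it shows why a zero level
count leaves the buffer pointer uninitialized for the later release:

	struct xfs_buf	*bp;			/* only ever assigned inside the loop */
	int		level;

	level = be16_to_cpu(block->bb_level);	/* 0 in the corrupted case */
	while (level-- > 0) {
		/* read the next btree block into bp; never runs when level == 0 */
	}
	/* ... walk the leaf blocks ... */
	xfs_trans_brelse(tp, bp);		/* releases stack garbage -> crash */
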
From 228de124f290e6b981b2c61fbd78215e11264044 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Tue, 19 Mar 2019 08:16:21 -0700
Subject: xfs: dabtree scrub needs to range-check level

Make sure scrub's dabtree iterator function checks that we're not going
deeper in the stack than our cursor permits.

Signed-off-by: Darrick J. Wong
Reviewed-by: Brian Foster
---
 fs/xfs/scrub/dabtree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4bfdee..90527b094878 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -574,6 +574,11 @@ xchk_da_btree(
 		/* Drill another level deeper. */
 		blkno = be32_to_cpu(key->before);
 		level++;
+		if (level >= XFS_DA_NODE_MAXDEPTH) {
+			/* Too deep! */
+			xchk_da_set_corrupt(&ds, level - 1);
+			break;
+		}
 		ds.tree_level--;
 		error = xchk_da_btree_block(&ds, level, blkno);
 		if (error)
-- cgit

From a72e9d8d69e7ca848ddd4c4db72d3ab280c1775d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Tue, 19 Mar 2019 08:16:22 -0700
Subject: xfs: fix btree scrub checking with regards to root-in-inode

In xchk_btree_check_owner, we can be passed a null buffer pointer. This
should only happen for the root of a root-in-inode btree type, but we
should program defensively in case the btree cursor state ever gets
screwed up and we get a null buffer anyway.

Coverity-id: 1438713
Signed-off-by: Darrick J. Wong
Reviewed-by: Brian Foster
---
 fs/xfs/scrub/btree.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 6f94d1f7322d..117910db51b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -415,8 +415,17 @@ xchk_btree_check_owner(
 	struct xfs_btree_cur	*cur = bs->cur;
 	struct check_owner	*co;
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+	/*
+	 * In theory, xfs_btree_get_block should only give us a null buffer
+	 * pointer for the root of a root-in-inode btree type, but we need
+	 * to check defensively here in case the cursor state is also screwed
+	 * up.
+	 */
+	if (bp == NULL) {
+		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
 		return 0;
+	}
 
 	/*
 	 * We want to cross-reference each btree block with the bnobt
-- cgit

From 4b0bce30f39b7733420bb8b28e340aa91c219bc1 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Tue, 19 Mar 2019 08:16:22 -0700
Subject: xfs: always init bma in xfs_bmapi_write

Always init the tp/ip fields of bma in xfs_bmapi_write so that the
bmapi_finish at the bottom never trips over null transaction or inode
pointers.

Coverity-id: 1443964
Signed-off-by: Darrick J. Wong
Reviewed-by: Brian Foster
---
 fs/xfs/libxfs/xfs_bmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ae4c3b0d84db..4637ae1ae91c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4252,9 +4252,13 @@ xfs_bmapi_write(
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
 	int			*nmap)		/* i/o: mval size/count */
 {
+	struct xfs_bmalloca	bma = {
+		.tp		= tp,
+		.ip		= ip,
+		.total		= total,
+	};
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp;
-	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
 	xfs_fileoff_t		end;		/* end of mapped file region */
 	bool			eof = false;	/* after the end of extents */
 	int			error;		/* error return */
@@ -4322,10 +4326,6 @@ xfs_bmapi_write(
 		eof = true;
 	if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
 		bma.prev.br_startoff = NULLFILEOFF;
-	bma.tp = tp;
-	bma.ip = ip;
-	bma.total = total;
-	bma.datatype = 0;
 	bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
 
 	n = 0;
-- cgit
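The xfs_bmapi_write change leans on a C initializer rule worth spelling out: when a struct is
initialized with a designated-initializer list, every member not named in the list is
zero-initialized, which is why the old explicit "bma.datatype = 0" assignment could simply be
dropped and why the bmapi_finish call at the bottom can no longer see uninitialized fields. A
stand-alone illustration with made-up names (not XFS code):

	#include <assert.h>

	struct alloc_args {
		void	*tp;
		void	*ip;
		int	total;
		int	minleft;	/* not named in the initializer below */
		int	datatype;	/* likewise */
	};

	int main(void)
	{
		int	dummy_tp, dummy_ip;
		struct alloc_args args = {
			.tp	= &dummy_tp,
			.ip	= &dummy_ip,
			.total	= 16,
		};

		/* members omitted from a designated initializer are zero/NULL */
		assert(args.minleft == 0 && args.datatype == 0);
		return 0;
	}
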
From ed79dac98c5e9f8471456afe2cc09a3912586b52 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 22 Mar 2019 18:10:22 -0700
Subject: xfs: prohibit fstrim in norecovery mode

The xfs fstrim implementation uses the free space btrees to find free
space that can be discarded. If we haven't recovered the log, the bnobt
will be stale and we absolutely *cannot* use stale metadata to zap the
underlying storage.

Signed-off-by: Darrick J. Wong
Reviewed-by: Eric Sandeen
---
 fs/xfs/xfs_discard.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07edafd81..9ee2a7d02e70 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -161,6 +161,14 @@ xfs_ioc_trim(
 		return -EPERM;
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
+
+	/*
+	 * We haven't recovered the log, so we cannot use our bnobt-guided
+	 * storage zapping commands.
+	 */
+	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+		return -EROFS;
+
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -EFAULT;
-- cgit
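With this check in place, a trim request against an XFS filesystem mounted with
"-o ro,norecovery" fails up front with EROFS instead of walking a potentially stale bnobt. A
minimal userspace sketch of what a caller sees; the mount point path is made up:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range = {
			.start	= 0,
			.len	= UINT64_MAX,	/* whole filesystem */
			.minlen	= 0,
		};
		int fd = open("/mnt/scratch", O_RDONLY);	/* hypothetical norecovery mount */

		if (fd < 0)
			return 1;
		if (ioctl(fd, FITRIM, &range) < 0)
			fprintf(stderr, "FITRIM: %s\n", strerror(errno));	/* EROFS here */
		close(fd);
		return 0;
	}
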
From 2032a8a27b5cc0f578d37fa16fa2494b80a0d00a Mon Sep 17 00:00:00 2001
From: Brian Foster
Date: Mon, 25 Mar 2019 17:01:45 -0700
Subject: xfs: serialize unaligned dio writes against all other dio writes

XFS applies more strict serialization constraints to unaligned direct
writes to accommodate things like direct I/O layer zeroing, unwritten
extent conversion, etc. Unaligned submissions acquire the exclusive
iolock and wait for in-flight dio to complete to ensure multiple
submissions do not race on the same block and cause data corruption.

This generally works in the case of an aligned dio followed by an
unaligned dio, but the serialization is lost if I/Os occur in the
opposite order. If an unaligned write is submitted first and
immediately followed by an overlapping, aligned write, the latter
submits without the typical unaligned serialization barriers because
there is no indication of an unaligned dio still in-flight. This can
lead to unpredictable results.

To provide proper unaligned dio serialization, require that such direct
writes are always the only dio allowed in-flight at one time for a
particular inode. We already acquire the exclusive iolock and drain
pending dio before submitting the unaligned dio. Wait once more after
the dio submission to hold the iolock across the I/O and prevent
further submissions until the unaligned I/O completes. This is heavy
handed, but consistent with the current pre-submission serialization
for unaligned direct writes.

Signed-off-by: Brian Foster
Reviewed-by: Allison Henderson
Reviewed-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_file.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f2e2845eb76..a7ceae90110e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
 	count = iov_iter_count(from);
 
 	/*
-	 * If we are doing unaligned IO, wait for all other IO to drain,
-	 * otherwise demote the lock if we had to take the exclusive lock
-	 * for other reasons in xfs_file_aio_write_checks.
+	 * If we are doing unaligned IO, we can't allow any other overlapping IO
+	 * in-flight at the same time or we risk data corruption. Wait for all
+	 * other IO to drain before we submit. If the IO is aligned, demote the
+	 * iolock if we had to take the exclusive lock in
+	 * xfs_file_aio_write_checks() for other reasons.
 	 */
 	if (unaligned_io) {
-		/* If we are going to wait for other DIO to finish, bail */
-		if (iocb->ki_flags & IOCB_NOWAIT) {
-			if (atomic_read(&inode->i_dio_count))
-				return -EAGAIN;
-		} else {
-			inode_dio_wait(inode);
-		}
+		/* unaligned dio always waits, bail */
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		inode_dio_wait(inode);
 	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+
+	/*
+	 * If unaligned, this is the only IO in-flight. If it has not yet
+	 * completed, wait on it before we release the iolock to prevent
+	 * subsequent overlapping IO.
+	 */
+	if (ret == -EIOCBQUEUED && unaligned_io)
+		inode_dio_wait(inode);
 out:
 	xfs_iunlock(ip, iolock);
-- cgit
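For illustration, this is roughly the shape of workload the patch serializes: a sub-fs-block
(unaligned) direct write submitted concurrently with an overlapping, fs-block-aligned direct
write to the same region. This is a sketch of the scenario, not a reliable reproducer; it
assumes a 4096-byte filesystem block size, a 512-byte logical sector size, and a made-up file
path:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	static int fd;

	static void *write_unaligned(void *arg)
	{
		void *buf;

		if (posix_memalign(&buf, 512, 512))
			return NULL;
		memset(buf, 0xaa, 512);
		pwrite(fd, buf, 512, 512);	/* sector-aligned, but not fs-block-aligned */
		free(buf);
		return NULL;
	}

	static void *write_aligned(void *arg)
	{
		void *buf;

		if (posix_memalign(&buf, 4096, 4096))
			return NULL;
		memset(buf, 0xbb, 4096);
		pwrite(fd, buf, 4096, 0);	/* fs-block-aligned, overlaps the same block */
		free(buf);
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		fd = open("/mnt/scratch/file", O_RDWR | O_CREAT | O_DIRECT, 0644);
		if (fd < 0)
			return 1;
		pthread_create(&t1, NULL, write_unaligned, NULL);
		pthread_create(&t2, NULL, write_aligned, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		close(fd);
		return 0;
	}

With the patch applied, the unaligned submission holds the exclusive iolock until its dio
completes, so the overlapping aligned write cannot be in flight at the same time.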