diff options
Diffstat (limited to 'fs/xfs/xfs_buf_item_recover.c')
| -rw-r--r-- | fs/xfs/xfs_buf_item_recover.c | 329 |
1 files changed, 281 insertions, 48 deletions
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 04faa7310c4f..e4c8af873632 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -22,6 +22,20 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_quota.h" +#include "xfs_alloc.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) /* * This structure is used during recovery to record the buf log items which @@ -76,7 +90,7 @@ xlog_add_buffer_cancelled( return false; } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); bcp->bc_blkno = blkno; bcp->bc_len = len; bcp->bc_refcount = 1; @@ -120,7 +134,7 @@ xlog_put_buffer_cancelled( if (--bcp->bc_refcount == 0) { list_del(&bcp->bc_list); - kmem_free(bcp); + kfree(bcp); } return true; } @@ -145,7 +159,7 @@ STATIC enum xlog_recover_reorder xlog_recover_buf_reorder( struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; if (buf_f->blf_flags & XFS_BLF_CANCEL) return XLOG_REORDER_CANCEL_LIST; @@ -159,7 +173,7 @@ xlog_recover_buf_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); } @@ -173,11 +187,11 @@ xlog_recover_buf_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base; if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); + xfs_err(log->l_mp, "bad buffer log item size (%zd)", + item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -219,7 +233,7 @@ xlog_recover_validate_buf_type( * inconsistent state resulting in verification failures. Hence for now * just avoid the verification stage for non-crc filesystems */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); @@ -248,12 +262,18 @@ xlog_recover_validate_buf_type( case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; + case XFS_RTRMAP_CRC_MAGIC: + bp->b_ops = &xfs_rtrmapbt_buf_ops; + break; case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; case XFS_REFC_CRC_MAGIC: bp->b_ops = &xfs_refcountbt_buf_ops; break; + case XFS_RTREFC_CRC_MAGIC: + bp->b_ops = &xfs_rtrefcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -381,9 +401,18 @@ xlog_recover_validate_buf_type( break; #ifdef CONFIG_XFS_RT case XFS_BLFT_RTBITMAP_BUF: + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) { + warnmsg = "Bad rtbitmap magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP); + break; case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) { + warnmsg = "Bad rtsummary magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY); break; #endif /* CONFIG_XFS_RT */ default: @@ -414,13 +443,12 @@ xlog_recover_validate_buf_type( * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. + * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_item_init(bp, mp); bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; @@ -459,8 +487,8 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(buf_f->blf_data_map, buf_f->blf_map_size, bit); ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(item->ri_buf[i].iov_base != NULL); + ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0); ASSERT(BBTOB(bp->b_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); @@ -472,8 +500,8 @@ xlog_recover_do_reg_buffer( * the log. Hence we need to trim nbits back to the length of * the current region being copied out of the log. */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -483,30 +511,29 @@ xlog_recover_do_reg_buffer( fa = NULL; if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { + if (item->ri_buf[i].iov_base == NULL) { xfs_alert(mp, "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < size_disk_dquot) { + if (item->ri_buf[i].iov_len < size_disk_dquot) { xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); + "XFS: dquot too small (%zd) in %s.", + item->ri_buf[i].iov_len, __func__); goto next; } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); + fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); + fa, xfs_buf_daddr(bp)); goto next; } } memcpy(xfs_buf_offset(bp, (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ + item->ri_buf[i].iov_base, /* source */ nbits<<XFS_BLF_SHIFT); /* length */ next: i++; @@ -548,11 +575,11 @@ xlog_recover_do_dquot_buffer( type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; + type |= XFS_DQTYPE_USER; if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; + type |= XFS_DQTYPE_PROJ; if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; + type |= XFS_DQTYPE_GROUP; /* * This type of quotas was turned off, so ignore this buffer */ @@ -599,13 +626,13 @@ xlog_recover_do_inode_buffer( * Post recovery validation only works properly on CRC enabled * filesystems. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + - offsetof(xfs_dinode_t, di_next_unlinked); + offsetof(struct xfs_dinode, di_next_unlinked); while (next_unlinked_offset >= (reg_buf_offset + reg_buf_bytes)) { @@ -642,8 +669,8 @@ xlog_recover_do_inode_buffer( if (next_unlinked_offset < reg_buf_offset) continue; - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT(item->ri_buf[item_index].iov_base != NULL); + ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); /* @@ -651,7 +678,7 @@ xlog_recover_do_inode_buffer( * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = item->ri_buf[item_index].i_addr + + logged_nextp = item->ri_buf[item_index].iov_base + next_unlinked_offset - reg_buf_offset; if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { xfs_alert(mp, @@ -678,6 +705,100 @@ xlog_recover_do_inode_buffer( } /* + * Update the in-memory superblock and perag structures from the primary SB + * buffer. + * + * This is required because transactions running after growfs may require the + * updated values to be set in a previous fully commit transaction. + */ +static int +xlog_recover_do_primary_sb_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_dsb *dsb = bp->b_addr; + xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; + xfs_rgnumber_t orig_rgcount = mp->m_sb.sb_rgcount; + int error; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + + if (orig_agcount == 0) { + xfs_alert(mp, "Trying to grow file system without AGs"); + return -EFSCORRUPTED; + } + + /* + * Update the in-core super block from the freshly recovered on-disk one. + */ + xfs_sb_from_disk(&mp->m_sb, dsb); + + /* + * Grow can change the device size. Mirror that into the buftarg. + */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + + if (mp->m_sb.sb_agcount < orig_agcount) { + xfs_alert(mp, "Shrinking AG count in log recovery not supported"); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < orig_rgcount) { + xfs_warn(mp, + "Shrinking rtgroup count in log recovery not supported"); + return -EFSCORRUPTED; + } + + /* + * If the last AG was grown or shrunk, we also need to update the + * length in the in-core perag structure and values depending on it. + */ + error = xfs_update_last_ag_size(mp, orig_agcount); + if (error) + return error; + + /* + * If the last rtgroup was grown or shrunk, we also need to update the + * length in the in-core rtgroup structure and values depending on it. + * Ignore this on any filesystem with zero rtgroups. + */ + if (orig_rgcount > 0) { + error = xfs_update_last_rtgroup_size(mp, orig_rgcount); + if (error) + return error; + } + + /* + * Initialize the new perags, and also update various block and inode + * allocator setting based off the number of AGs or total blocks. + * Because of the latter this also needs to happen if the agcount did + * not change. + */ + error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed recovery per-ag init: %d", error); + return error; + } + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + + error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); + if (error) { + xfs_warn(mp, "Failed recovery rtgroup init: %d", error); + return error; + } + return 0; +} + +/* * V5 filesystems know the age of the buffer on disk being recovered. We can * have newer objects on disk than we are replaying, and so for these cases we * don't want to replay the current change as that will make the buffer contents @@ -700,7 +821,8 @@ xlog_recover_do_inode_buffer( static xfs_lsn_t xlog_recover_get_buf_lsn( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint32_t magic32; uint16_t magic16; @@ -708,19 +830,39 @@ xlog_recover_get_buf_lsn( void *blk = bp->b_addr; uuid_t *uuid; xfs_lsn_t lsn = -1; + uint16_t blft; /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) + goto recover_immediately; + + /* + * realtime bitmap and summary file blocks do not have magic numbers or + * UUIDs, so we must recover them immediately. + */ + blft = xfs_blft_from_flags(buf_f); + if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF || + blft == XFS_BLFT_RTSUMMARY_BUF)) goto recover_immediately; magic32 = be32_to_cpu(*(__be32 *)blk); switch (magic32) { + case XFS_RTSUMMARY_MAGIC: + case XFS_RTBITMAP_MAGIC: { + struct xfs_rtbuf_blkinfo *hdr = blk; + + lsn = be64_to_cpu(hdr->rt_lsn); + uuid = &hdr->rt_uuid; + break; + } case XFS_ABTB_CRC_MAGIC: case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: case XFS_REFC_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -729,6 +871,8 @@ xlog_recover_get_buf_lsn( uuid = &btb->bb_u.s.bb_uuid; break; } + case XFS_RTRMAP_CRC_MAGIC: + case XFS_RTREFC_CRC_MAGIC: case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: { struct xfs_btree_block *btb = blk; @@ -777,7 +921,7 @@ xlog_recover_get_buf_lsn( * the relevant UUID in the superblock. */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + if (xfs_has_metauuid(mp)) uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; else uuid = &((struct xfs_dsb *)blk)->sb_uuid; @@ -796,6 +940,7 @@ xlog_recover_get_buf_lsn( switch (magicda) { case XFS_DIR3_LEAF1_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: case XFS_DA3_NODE_MAGIC: lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; @@ -805,7 +950,7 @@ xlog_recover_get_buf_lsn( } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } @@ -867,11 +1012,10 @@ xlog_recover_buf_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t current_lsn) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; - uint buf_flags; xfs_lsn_t lsn; /* @@ -890,13 +1034,8 @@ xlog_recover_buf_commit_pass2( } trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); + 0, &bp, NULL); if (error) return error; @@ -919,10 +1058,20 @@ xlog_recover_buf_commit_pass2( * the verifier will be reset to match whatever recover turns that * buffer into. */ - lsn = xlog_recover_get_buf_lsn(mp, bp); + lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + + /* + * We're skipping replay of this buffer log item due to the log + * item LSN being behind the ondisk buffer. Verify the buffer + * contents since we aren't going to run the write verifier. + */ + if (bp->b_ops) { + bp->b_ops->verify_read(bp); + error = bp->b_error; + } goto out_release; } @@ -937,11 +1086,38 @@ xlog_recover_buf_commit_pass2( dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); if (!dirty) goto out_release; + } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) && + xfs_buf_daddr(bp) == 0) { + error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, + current_lsn); + if (error) + goto out_writebuf; + + /* Update the rt superblock if we have one. */ + if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) { + struct xfs_buf *rtsb_bp = mp->m_rtsb_bp; + + xfs_buf_lock(rtsb_bp); + xfs_buf_hold(rtsb_bp); + xfs_update_rtsb(rtsb_bp, bp); + rtsb_bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(rtsb_bp, buffer_list); + xfs_buf_relse(rtsb_bp); + } } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } /* + * Buffer held by buf log item during 'normal' buffer recovery must + * be committed through buffer I/O submission path to ensure proper + * release. When error occurs during sb buffer recovery, log shutdown + * will be done before submitting buffer list so that buffers can be + * released correctly through ioend failure path. + */ +out_writebuf: + + /* * Perform delayed write on the buffer. Asynchronous writes will be * slower when taking into account all the buffers to be flushed. * @@ -950,7 +1126,7 @@ xlog_recover_buf_commit_pass2( * or inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel - * running with a different inode cluster size. Regardless, if the + * running with a different inode cluster size. Regardless, if * the inode buffer size isn't max(blocksize, inode_cluster_size) * for *our* value of inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't @@ -963,7 +1139,7 @@ xlog_recover_buf_commit_pass2( error = xfs_bwrite(bp); } else { ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); } @@ -982,3 +1158,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +int +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + void *p; + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + int i; + + if (!log->l_buf_cancel_table) + return; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kfree(bc); + } + } + + kfree(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} |
