diff options
Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
| -rw-r--r-- | fs/xfs/xfs_log_recover.c | 495 |
1 files changed, 276 insertions, 219 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 322eb2ee6c55..03e42c7dab56 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,8 +190,8 @@ xlog_bwrite( */ STATIC void xlog_header_check_dump( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); @@ -207,8 +207,8 @@ xlog_header_check_dump( */ STATIC int xlog_header_check_recover( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -238,8 +238,8 @@ xlog_header_check_recover( */ STATIC int xlog_header_check_mount( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -329,7 +329,7 @@ xlog_find_verify_cycle( * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ - bufblks = 1 << ffs(nbblks); + bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -361,7 +361,7 @@ xlog_find_verify_cycle( *new_blk = -1; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -400,7 +400,7 @@ xlog_find_verify_log_record( xfs_daddr_t i; char *buffer; char *offset = NULL; - xlog_rec_header_t *head = NULL; + struct xlog_rec_header *head = NULL; int error = 0; int smallmem = 0; int num_blks = *last_blk - start_blk; @@ -437,7 +437,7 @@ xlog_find_verify_log_record( goto out; } - head = (xlog_rec_header_t *)offset; + head = (struct xlog_rec_header *)offset; if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; @@ -477,7 +477,7 @@ xlog_find_verify_log_record( *last_blk = i; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -731,7 +731,7 @@ validate_head: goto out_free_buffer; } - kmem_free(buffer); + kvfree(buffer); if (head_blk == log_bbnum) *return_head_blk = 0; else @@ -745,7 +745,7 @@ validate_head: return 0; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to find log head"); return error; @@ -999,7 +999,7 @@ xlog_verify_tail( "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", orig_tail, *tail_blk); out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -1046,7 +1046,7 @@ xlog_verify_head( error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, XLOG_MAX_ICLOGS, tmp_buffer, &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped); - kmem_free(tmp_buffer); + kvfree(tmp_buffer); if (error < 0) return error; @@ -1177,8 +1177,8 @@ xlog_check_unmount_rec( */ xlog_assign_atomic_lsn(&log->l_tail_lsn, log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); + log->l_ailp->ail_head_lsn = + atomic64_read(&log->l_tail_lsn); *tail_blk = after_umount_blk; *clean = true; @@ -1212,11 +1212,7 @@ xlog_set_state( if (bump_cycle) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + log->l_ailp->ail_head_lsn = be64_to_cpu(rhead->h_lsn); } /* @@ -1241,7 +1237,7 @@ xlog_find_tail( xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; char *offset = NULL; char *buffer; int error; @@ -1340,7 +1336,7 @@ xlog_find_tail( * headers if we have a filesystem using non-persistent counters. */ if (clean) - set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate); + xfs_set_clean(log->l_mp); /* * Make sure that there are no blocks in front of the head @@ -1365,7 +1361,7 @@ xlog_find_tail( error = xlog_clear_stale_blocks(log, tail_lsn); done: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to locate log tail"); @@ -1399,6 +1395,7 @@ xlog_find_zeroed( xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; int error, log_bbnum = log->l_logBBsize; + int ret = 1; *blk_no = 0; @@ -1413,8 +1410,7 @@ xlog_find_zeroed( first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; - kmem_free(buffer); - return 1; + goto out_free_buffer; } /* check partially zeroed log */ @@ -1424,8 +1420,8 @@ xlog_find_zeroed( last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ - kmem_free(buffer); - return 0; + ret = 0; + goto out_free_buffer; } /* we have a partially zeroed log */ @@ -1471,10 +1467,10 @@ xlog_find_zeroed( *blk_no = last_blk; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) return error; - return 1; + return ret; } /* @@ -1491,7 +1487,7 @@ xlog_add_record( int tail_cycle, int tail_block) { - xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + struct xlog_rec_header *recp = (struct xlog_rec_header *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -1528,7 +1524,7 @@ xlog_write_log_records( * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ - bufblks = 1 << ffs(blocks); + bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -1583,7 +1579,7 @@ xlog_write_log_records( } out_free_buffer: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -1723,30 +1719,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1773,6 +1763,37 @@ xlog_recover_iget( return 0; } +/* + * Get an inode so that we can recover a log operation. + * + * Log intent items that target inodes effectively contain a file handle. + * Check that the generation number matches the intent item like we do for + * other file handles. Log intent items defined after this validation weakness + * was identified must use this function. + */ +int +xlog_recover_iget_handle( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t gen, + struct xfs_inode **ipp) +{ + struct xfs_inode *ip; + int error; + + error = xlog_recover_iget(mp, ino, &ip); + if (error) + return error; + + if (VFS_I(ip)->i_generation != gen) { + xfs_irele(ip); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + /****************************************************************************** * * Log recover routines @@ -1795,6 +1816,14 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_bud_item_ops, &xlog_attri_item_ops, &xlog_attrd_item_ops, + &xlog_xmi_item_ops, + &xlog_xmd_item_ops, + &xlog_rtefi_item_ops, + &xlog_rtefd_item_ops, + &xlog_rtrui_item_ops, + &xlog_rtrud_item_ops, + &xlog_rtcui_item_ops, + &xlog_rtcud_item_ops, }; static const struct xlog_recover_item_ops * @@ -1826,7 +1855,7 @@ xlog_find_item_ops( * from the transaction. However, we can't do that until after we've * replayed all the other items because they may be dependent on the * cancelled buffer and replaying the cancelled buffer can remove it - * form the cancelled buffer table. Hence they have tobe done last. + * form the cancelled buffer table. Hence they have to be done last. * * 3. Inode allocation buffers must be replayed before inode items that * read the buffer and replay changes into it. For filesystems using the @@ -1939,6 +1968,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + const struct xfs_defer_op_type *ops) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, &log->r_dfops, ops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2040,7 +2092,8 @@ xlog_recover_add_item( { struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); + item = kzalloc(sizeof(struct xlog_recover_item), + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -2078,15 +2131,15 @@ xlog_recover_add_to_cont_trans( item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); - old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; - old_len = item->ri_buf[item->ri_cnt-1].i_len; + old_ptr = item->ri_buf[item->ri_cnt-1].iov_base; + old_len = item->ri_buf[item->ri_cnt-1].iov_len; - ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); - item->ri_buf[item->ri_cnt-1].i_len += len; - item->ri_buf[item->ri_cnt-1].i_addr = ptr; + item->ri_buf[item->ri_cnt-1].iov_len += len; + item->ri_buf[item->ri_cnt-1].iov_base = ptr; trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -2143,7 +2196,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, 0); + ptr = xlog_kvmalloc(len); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; @@ -2165,14 +2218,13 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; - item->ri_buf = - kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - 0); + item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf), + GFP_KERNEL | __GFP_NOFAIL); } if (item->ri_total <= item->ri_cnt) { @@ -2180,13 +2232,13 @@ xlog_recover_add_to_trans( "log item region count (%d) overflowed size (%d)", item->ri_cnt, item->ri_total); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } /* Description region is ri_buf[0] */ - item->ri_buf[item->ri_cnt].i_addr = ptr; - item->ri_buf[item->ri_cnt].i_len = len; + item->ri_buf[item->ri_cnt].iov_base = ptr; + item->ri_buf[item->ri_cnt].iov_len = len; item->ri_cnt++; trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; @@ -2210,13 +2262,13 @@ xlog_recover_free_trans( /* Free the regions in the item. */ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) - kmem_free(item->ri_buf[i].i_addr); + kvfree(item->ri_buf[i].iov_base); /* Free the item itself */ - kmem_free(item->ri_buf); - kmem_free(item); + kfree(item->ri_buf); + kfree(item); } /* Free the transaction recover structure */ - kmem_free(trans); + kfree(trans); } /* @@ -2315,7 +2367,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ - trans = kmem_zalloc(sizeof(struct xlog_recover), 0); + trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); @@ -2439,7 +2491,10 @@ xlog_recover_process_data( ohead = (struct xlog_op_header *)dp; dp += sizeof(*ohead); - ASSERT(dp <= end); + if (dp > end) { + xfs_warn(log->l_mp, "%s: op header overrun", __func__); + return -EFSCORRUPTED; + } /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, @@ -2511,7 +2566,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } @@ -2533,36 +2588,26 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2571,21 +2616,14 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. */ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; - error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); if (error) goto err; @@ -2606,27 +2644,34 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; - - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + xfs_defer_cancel_recovery(log->l_mp, dfp); } +} - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); +/* + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. + */ +int +xlog_recover_finish_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* @@ -2638,7 +2683,7 @@ xlog_recover_clear_agi_bucket( struct xfs_perag *pag, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_trans *tp; struct xfs_agi *agi; struct xfs_buf *agibp; @@ -2649,7 +2694,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_error; - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out_abort; @@ -2669,7 +2714,7 @@ out_abort: xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, - pag->pag_agno); + pag_agno(pag)); return; } @@ -2679,7 +2724,7 @@ xlog_recover_iunlink_bucket( struct xfs_agi *agi, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; xfs_agino_t prev_agino, agino; @@ -2687,9 +2732,8 @@ xlog_recover_iunlink_bucket( agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - error = xfs_iget(mp, NULL, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), - 0, 0, &ip); + error = xfs_iget(mp, NULL, xfs_agino_to_ino(pag, agino), 0, 0, + &ip); if (error) break; @@ -2711,7 +2755,9 @@ xlog_recover_iunlink_bucket( * just to flush the inodegc queue and wait for it to * complete. */ - xfs_inodegc_flush(mp); + error = xfs_inodegc_flush(mp); + if (error) + break; } prev_agino = agino; @@ -2719,10 +2765,15 @@ xlog_recover_iunlink_bucket( } if (prev_ip) { + int error2; + ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); + + error2 = xfs_inodegc_flush(mp); + if (error2 && !error) + return error2; } - xfs_inodegc_flush(mp); return error; } @@ -2758,7 +2809,7 @@ xlog_recover_iunlink_ag( int bucket; int error; - error = xfs_read_agi(pag, NULL, &agibp); + error = xfs_read_agi(pag, NULL, 0, &agibp); if (error) { /* * AGI is b0rked. Don't process it. @@ -2789,7 +2840,6 @@ xlog_recover_iunlink_ag( * bucket and remaining inodes on it unreferenced and * unfreeable. */ - xfs_inodegc_flush(pag->pag_mount); xlog_recover_clear_agi_bucket(pag, bucket); } } @@ -2801,18 +2851,10 @@ static void xlog_recover_process_iunlinks( struct xlog *log) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; - for_each_perag(log->l_mp, agno, pag) + while ((pag = xfs_perag_next(log->l_mp, pag))) xlog_recover_iunlink_ag(pag); - - /* - * Flush the pending unlinked inodes to ensure that the inactivations - * are fully completed on disk and the incore inodes can be reclaimed - * before we signal that recovery is complete. - */ - xfs_inodegc_flush(log->l_mp); } STATIC void @@ -2821,23 +2863,12 @@ xlog_unpack_data( char *dp, struct xlog *log) { - int i, j, k; + int i; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + *(__be32 *)dp = *xlog_cycle_data(rhead, i); dp += BBSIZE; } - - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } } /* @@ -2852,20 +2883,34 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the glory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2876,11 +2921,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } @@ -2952,7 +2997,7 @@ xlog_do_recovery_pass( int pass, xfs_daddr_t *first_bad) /* out: first bad log rec */ { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; @@ -2960,7 +3005,7 @@ xlog_do_recovery_pass( int error = 0, h_size, h_len; int error2 = 0; int bblks, split_bblks; - int hblks, split_hblks, wrapped_hblks; + int hblks = 1, split_hblks, wrapped_hblks; int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); @@ -2971,6 +3016,10 @@ xlog_do_recovery_pass( for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -2981,15 +3030,11 @@ xlog_do_recovery_pass( * iclog header and extract the header size from it. Get a * new hbp that is the correct size. */ - hbp = xlog_alloc_buffer(log, 1); - if (!hbp) - return -ENOMEM; - error = xlog_bread(log, tail_blk, 1, hbp, &offset); if (error) goto bread_err1; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; /* * xfsprogs has a bug where record length is based on lsunit but @@ -3016,23 +3061,30 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - hblks = xlog_logrec_hblks(log, rhead); - if (hblks != 1) { - kmem_free(hbp); - hbp = xlog_alloc_buffer(log, hblks); + /* + * This open codes xlog_logrec_hblks so that we can reuse the + * fixed up h_size value calculated above. Without that we'd + * still allocate the buffer based on the incorrect on-disk + * size. + */ + if (h_size > XLOG_HEADER_CYCLE_SIZE && + (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) { + hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + if (hblks > 1) { + kvfree(hbp); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + } } } else { ASSERT(log->l_sectBBsize == 1); - hblks = 1; - hbp = xlog_alloc_buffer(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; } - if (!hbp) - return -ENOMEM; dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { - kmem_free(hbp); + kvfree(hbp); return -ENOMEM; } @@ -3089,7 +3141,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, split_hblks ? blk_no : 0, h_size); if (error) @@ -3171,7 +3223,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; @@ -3193,16 +3245,33 @@ xlog_do_recovery_pass( } bread_err2: - kmem_free(dbp); + kvfree(dbp); bread_err1: - kmem_free(hbp); + kvfree(hbp); /* - * Submit buffers that have been added from the last record processed, - * regardless of error status. + * Submit buffers that have been dirtied by the last record recovered. */ - if (!list_empty(&buffer_list)) + if (!list_empty(&buffer_list)) { + if (error) { + /* + * If there has been an item recovery error then we + * cannot allow partial checkpoint writeback to + * occur. We might have multiple checkpoints with the + * same start LSN in this buffer list, and partial + * writeback of a checkpoint in this situation can + * prevent future recovery of all the changes in the + * checkpoints at this start LSN. + * + * Note: Shutting down the filesystem will result in the + * delwri submission marking all the buffers stale, + * completing them and cleaning up _XBF_LOGRECOVERY + * state without doing any IO. + */ + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } error2 = xfs_buf_delwri_submit(&buffer_list); + } if (error && first_bad) *first_bad = rhead_blk; @@ -3300,14 +3369,13 @@ xlog_do_recover( /* * We now update the tail_lsn since much of the recovery has completed - * and there may be space available to use. If there were no extent - * or iunlinks, we can free up the entire log and set the tail_lsn to - * be the last_sync_lsn. This was set in xlog_find_tail to be the - * lsn of the last known good LR on disk. If there are extent frees - * or iunlinks they will have some entries in the AIL; so we look at - * the AIL to determine how to set the tail_lsn. + * and there may be space available to use. If there were no extent or + * iunlinks, we can free up the entire log. This was set in + * xlog_find_tail to be the lsn of the last known good LR on disk. If + * there are extent frees or iunlinks they will have some entries in the + * AIL; so we look at the AIL to determine how to set the tail_lsn. */ - xlog_assign_tail_lsn(mp); + xfs_ail_assign_tail_lsn(log->l_ailp); /* * Now that we've finished replaying all buffer and inode updates, @@ -3315,7 +3383,7 @@ xlog_do_recover( */ xfs_buf_lock(bp); xfs_buf_hold(bp); - error = _xfs_buf_read(bp, XBF_READ); + error = _xfs_buf_read(bp); if (error) { if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); @@ -3332,13 +3400,6 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, - &mp->m_maxagi); - if (error) { - xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); - return error; - } - mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); @@ -3437,12 +3498,19 @@ xlog_recover( * part of recovery so that the root and real-time bitmap inodes can be read in * from disk in between the two stages. This is necessary so that we can free * space in the real-time portion of the file system. + * + * We run this whole process under GFP_NOFS allocation context. We do a + * combination of non-transactional and transactional work, yet we really don't + * want to recurse into the filesystem from direct reclaim during any of this + * processing. This allows all the recovery code run here not to care about the + * memory allocation context it is running in. */ int xlog_recover_finish( struct xlog *log) { - int error; + unsigned int nofs_flags = memalloc_nofs_save(); + int error; error = xlog_recover_process_intents(log); if (error) { @@ -3456,7 +3524,7 @@ xlog_recover_finish( xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return error; + goto out_error; } /* @@ -3466,21 +3534,6 @@ xlog_recover_finish( */ xfs_log_force(log->l_mp, XFS_LOG_SYNC); - /* - * Now that we've recovered the log and all the intents, we can clear - * the log incompat feature bits in the superblock because there's no - * longer anything to protect. We rely on the AIL push to write out the - * updated superblock after everything else. - */ - if (xfs_clear_incompat_log_features(log->l_mp)) { - error = xfs_sync_sb(log->l_mp, false); - if (error < 0) { - xfs_alert(log->l_mp, - "Failed to clear log incompat features on recovery"); - return error; - } - } - xlog_recover_process_iunlinks(log); /* @@ -3502,9 +3555,13 @@ xlog_recover_finish( * and AIL. */ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + error = 0; + goto out_error; } - return 0; +out_error: + memalloc_nofs_restore(nofs_flags); + return error; } void |
