diff options
Diffstat (limited to 'fs/jbd2/recovery.c')
-rw-r--r-- | fs/jbd2/recovery.c | 414 |
1 files changed, 232 insertions, 182 deletions
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1f7664984d6e..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/crc32.h> #include <linux/blkdev.h> +#include <linux/string_choices.h> #endif /* @@ -38,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -64,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n) */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static void do_readahead(journal_t *journal, unsigned int start) { - int err; unsigned int max, nbufs, next; unsigned long long blocknr; struct buffer_head *bh; @@ -84,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start) nbufs = 0; for (next = start; next < max; next++) { - err = jbd2_journal_bmap(journal, next, &blocknr); + int err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { printk(KERN_ERR "JBD2: bad block at offset %u\n", @@ -93,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start) } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; + if (!bh) goto failed; - } if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; @@ -111,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start) if (nbufs) bh_readahead_batch(nbufs, bufs, 0); - err = 0; failed: if (nbufs) journal_brelse_array(bufs, nbufs); - return err; } #endif /* __KERNEL__ */ @@ -189,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -286,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. */ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -326,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -374,7 +377,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) be32_to_cpu(journal->j_superblock->s_sequence); jbd2_debug(1, "JBD2: ignoring %d transaction%s from the journal.\n", - dropped, (dropped == 1) ? "" : "s"); + dropped, str_plural(dropped)); #endif journal->j_transaction_sequence = ++info.end_transaction; journal->j_head = info.head_block; @@ -437,12 +440,33 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); } +static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) +{ + struct commit_header *h; + __be32 provided; + __u32 calculated; + void *tmpbuf; + + tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL); + if (!tmpbuf) + return false; + + memcpy(tmpbuf, buf, sizeof(struct commit_header)); + h = tmpbuf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); + kfree(tmpbuf); + + return provided == cpu_to_be32(calculated); +} + static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, journal_block_tag3_t *tag3, void *buf, __u32 sequence) @@ -454,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); @@ -463,6 +487,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return tag->t_checksum == cpu_to_be16(csum32); } +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID) +{ + char *tagp; + int flags; + int ret = 0; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + int err; + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + ret = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. */ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + ret = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return ret; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -471,13 +593,10 @@ static int do_one_pass(journal_t *journal, int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; - struct buffer_head * bh; + struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; - int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - int descr_csum_size = 0; - int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -495,6 +614,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -506,12 +650,6 @@ static int do_one_pass(journal_t *journal, */ while (1) { - int flags; - char * tagp; - journal_block_tag_t tag; - struct buffer_head * obh; - struct buffer_head * nbh; - cond_resched(); /* If we already know where to stop the log traversal, @@ -530,6 +668,8 @@ static int do_one_pass(journal_t *journal, * record. */ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -545,20 +685,16 @@ static int do_one_pass(journal_t *journal, tmp = (journal_header_t *)bh->b_data; - if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { - brelse(bh); + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) break; - } blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); - if (sequence != next_commit_ID) { - brelse(bh); + if (sequence != next_commit_ID) break; - } /* OK, we have a valid descriptor block which matches * all of the sequence number checks. What are we going @@ -567,11 +703,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (jbd2_journal_has_csum_v2or3(journal)) - descr_csum_size = - sizeof(struct jbd2_journal_block_tail); - if (descr_csum_size > 0 && - !jbd2_descriptor_block_csum_verify(journal, + if (!jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { /* * PASS_SCAN can see stale blocks due to lazy @@ -581,7 +713,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum recovering block %lu in log\n", next_log_block); err = -EFSBADCRC; - brelse(bh); goto failed; } need_check_commit_time = true; @@ -597,125 +728,39 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && - !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, - &crc32_sum)) { - put_bh(bh); + &crc32_sum)) break; - } - put_bh(bh); continue; } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - put_bh(bh); continue; } - /* A descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize - descr_csum_size) { - unsigned long io_block; - - memcpy(&tag, tagp, sizeof(tag)); - flags = be16_to_cpu(tag.t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk(KERN_ERR - "JBD2: IO error %d recovering " - "block %lu in log\n", - err, io_block); - } else { - unsigned long long blocknr; - - J_ASSERT(obh != NULL); - blocknr = read_tag_block(journal, - &tag); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (jbd2_journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Look for block corruption */ - if (!jbd2_block_tag_csum_verify( - journal, &tag, (journal_block_tag3_t *)tagp, - obh->b_data, be32_to_cpu(tmp->h_sequence))) { - brelse(obh); - success = -EFSBADCRC; - printk(KERN_ERR "JBD2: Invalid " - "checksum recovering " - "data block %llu in " - "journal block %lu\n", - blocknr, io_block); - block_error = 1; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD2: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JBD2_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JBD2_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += tag_bytes; - if (!(flags & JBD2_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JBD2_FLAG_LAST_TAG) - break; + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! + */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; } - brelse(bh); continue; case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + /* How to differentiate between interrupted commit * and journal corruption ? * @@ -760,7 +805,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum found in transaction %u\n", next_commit_ID); err = -EFSBADCRC; - brelse(bh); goto failed; } ignore_crc_mismatch: @@ -770,7 +814,6 @@ static int do_one_pass(journal_t *journal, */ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - brelse(bh); goto done; } @@ -780,8 +823,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ - if (pass == PASS_SCAN && - jbd2_has_feature_checksum(journal)) { + if (jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = @@ -790,7 +832,6 @@ static int do_one_pass(journal_t *journal, if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; - brelse(bh); break; } @@ -806,33 +847,45 @@ static int do_one_pass(journal_t *journal, goto chksum_error; crc32_sum = ~0; + goto chksum_ok; } - if (pass == PASS_SCAN && - !jbd2_commit_block_csum_verify(journal, - bh->b_data)) { - chksum_error: - if (commit_time < last_trans_commit_time) - goto ignore_crc_mismatch; - info->end_transaction = next_commit_ID; - info->head_block = head_block; - - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } + + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; } - if (pass == PASS_SCAN) { - last_trans_commit_time = commit_time; - head_block = next_log_block; + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; } - brelse(bh); + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; next_commit_ID++; continue; case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -843,16 +896,9 @@ static int do_one_pass(journal_t *journal, next_log_block); need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) { - brelse(bh); - continue; - } - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); - brelse(bh); if (err) goto failed; continue; @@ -860,12 +906,12 @@ static int do_one_pass(journal_t *journal, default: jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); - brelse(bh); goto done; } } done: + brelse(bh); /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the @@ -896,18 +942,18 @@ static int do_one_pass(journal_t *journal, success = err; } - if (block_error && success == 0) - success = -EIO; return success; failed: + brelse(bh); return err; } /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -928,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -940,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } |