diff options
Diffstat (limited to 'fs/jbd2')
-rw-r--r-- | fs/jbd2/Kconfig | 2 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 45 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 37 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 305 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 414 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 38 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 71 |
7 files changed, 453 insertions, 459 deletions
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 4ad2c67f93f1..9c19e1512101 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -2,8 +2,6 @@ config JBD2 tristate select CRC32 - select CRYPTO - select CRYPTO_CRC32C help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1c97e64c4784..b3971e91e8eb 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -79,17 +79,23 @@ __releases(&journal->j_state_lock) if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; tid_t tid = 0; + bool has_transaction = false; - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); - } else if (jbd2_cleanup_journal_tail(journal) == 0) { - /* We were able to recover space; yay! */ + } else if (jbd2_cleanup_journal_tail(journal) <= 0) { + /* + * We were able to recover space or the + * journal was aborted due to an error. + */ ; - } else if (tid) { + } else if (has_transaction) { /* * jbd2_journal_commit_transaction() may want * to take the checkpoint_mutex if JBD2_FLUSHED @@ -337,8 +343,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; - /* * journal_shrink_one_cp_list * @@ -350,7 +354,7 @@ enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - enum shrink_type type, + enum jbd2_shrink_type type, bool *released) { struct journal_head *last_jh; @@ -367,12 +371,12 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (type == SHRINK_DESTROY) { + if (type == JBD2_SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); if (ret < 0) { - if (type == SHRINK_BUSY_SKIP) + if (type == JBD2_SHRINK_BUSY_SKIP) continue; break; } @@ -409,6 +413,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, tid_t tid = 0; unsigned long nr_freed = 0; unsigned long freed; + bool first_set = false; again: spin_lock(&journal->j_list_lock); @@ -428,8 +433,10 @@ again: else transaction = journal->j_checkpoint_transactions; - if (!first_tid) + if (!first_set) { first_tid = transaction->t_tid; + first_set = true; + } last_transaction = journal->j_checkpoint_transactions->t_cpprev; next_transaction = transaction; last_tid = last_transaction->t_tid; @@ -439,7 +446,7 @@ again: tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - SHRINK_BUSY_SKIP, &released); + JBD2_SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -459,7 +466,7 @@ again: spin_unlock(&journal->j_list_lock); cond_resched(); - if (*nr_to_scan && next_tid) + if (*nr_to_scan && journal->j_shrink_transaction) goto again; out: trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, @@ -472,21 +479,25 @@ out: * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. - * If 'destroy' is set, release all buffers unconditionally. + * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If + * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a + * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some + * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function. * * Called with j_list_lock held. */ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, + enum jbd2_shrink_type type) { transaction_t *transaction, *last_transaction, *next_transaction; - enum shrink_type type; bool released; + WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); + transaction = journal->j_checkpoint_transactions; if (!transaction) return; - type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { @@ -527,7 +538,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal) spin_unlock(&journal->j_list_lock); break; } - __jbd2_journal_clean_checkpoint_list(journal, true); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); spin_unlock(&journal->j_list_lock); cond_resched(); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5e122586e06e..7203d2d2624d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under j_list_lock. The caller provided us with a ref against the + * buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, seq = cpu_to_be32(sequence); addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); kunmap_local(addr); if (jbd2_has_feature_csum3(j)) @@ -353,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) struct buffer_head *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; - int flags; + int escape; int err; unsigned long long blocknr; ktime_t start_time; @@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal, false); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); spin_unlock(&journal->j_list_lock); jbd2_debug(3, "JBD2: commit phase 1\n"); @@ -571,7 +571,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= atomic_read(&commit_transaction->t_outstanding_credits)); - err = 0; bufs = 0; descriptor = NULL; while (commit_transaction->t_buffers) { @@ -661,19 +660,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ set_bit(BH_JWrite, &jh2bh(jh)->b_state); JBUFFER_TRACE(jh, "ph3: write metadata"); - flags = jbd2_journal_write_metadata_buffer(commit_transaction, + escape = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &wbuf[bufs], blocknr); - if (flags < 0) { - jbd2_journal_abort(journal, flags); - continue; - } jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor buffer */ tag_flag = 0; - if (flags & 1) + if (escape) tag_flag |= JBD2_FLAG_ESCAPE; if (!first_tag) tag_flag |= JBD2_FLAG_SAME_UUID; @@ -743,10 +738,8 @@ start_journal_io: err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", journal->j_devname); - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) - jbd2_journal_abort(journal, err); + "JBD2: Detected IO errors %d while flushing file data on %s\n", + err, journal->j_devname); err = 0; } @@ -767,7 +760,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < jbd2_journal_get_max_txn_bufs(journal)) + if (freed < journal->j_max_transaction_buffers) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); @@ -777,9 +770,9 @@ start_journal_io: /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. */ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); @@ -1108,7 +1101,7 @@ restart_loop: commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); - journal->j_commit_sequence = commit_transaction->t_tid; + WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid); journal->j_committing_transaction = NULL; commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b6c114c11b97..6d5e76848733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); @@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) { __u32 csum; __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; - csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); @@ -197,7 +197,7 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); write_lock(&journal->j_state_lock); goto loop; @@ -220,19 +220,12 @@ loop: * so we don't sleep */ DEFINE_WAIT(wait); - int should_sleep = 1; prepare_to_wait(&journal->j_wait_commit, &wait, TASK_INTERRUPTIBLE); - if (journal->j_commit_sequence != journal->j_commit_request) - should_sleep = 0; transaction = journal->j_running_transaction; - if (transaction && time_after_eq(jiffies, - transaction->t_expires)) - should_sleep = 0; - if (journal->j_flags & JBD2_UNMOUNT) - should_sleep = 0; - if (should_sleep) { + if (transaction == NULL || + time_before(jiffies, transaction->t_expires)) { write_unlock(&journal->j_state_lock); schedule(); write_lock(&journal->j_state_lock); @@ -253,7 +246,7 @@ loop: goto loop; end_loop: - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd2_debug(1, "Journal thread exiting.\n"); @@ -288,6 +281,16 @@ static void journal_kill_thread(journal_t *journal) write_unlock(&journal->j_state_lock); } +static inline bool jbd2_data_needs_escaping(char *data) +{ + return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER); +} + +static inline void jbd2_data_do_escape(char *data) +{ + *((unsigned int *)data) = 0; +} + /* * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. * @@ -315,12 +318,8 @@ static void journal_kill_thread(journal_t *journal) * * * Return value: - * <0: Error - * >=0: Finished OK - * - * On success: - * Bit 0 set == escape performed on the data - * Bit 1 set == buffer copy-out performed (kfree the data after IO) + * =0: Finished OK without escape + * =1: Finished OK with escape */ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, @@ -328,10 +327,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr) { - int need_copy_out = 0; - int done_copy_out = 0; int do_escape = 0; - char *mapped_data; struct buffer_head *new_bh; struct folio *new_folio; unsigned int new_offset; @@ -355,83 +351,63 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, atomic_set(&new_bh->b_count, 1); spin_lock(&jh_in->b_state_lock); -repeat: /* * If a new transaction has already done a buffer copy-out, then * we use that version of the data for the commit. */ if (jh_in->b_frozen_data) { - done_copy_out = 1; new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data); + if (do_escape) + jbd2_data_do_escape(jh_in->b_frozen_data); } else { - new_folio = jh2bh(jh_in)->b_folio; - new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); - } + char *tmp; + char *mapped_data; - mapped_data = kmap_local_folio(new_folio, new_offset); - /* - * Fire data frozen trigger if data already wasn't frozen. Do this - * before checking for escaping, as the trigger may modify the magic - * offset. If a copy-out happens afterwards, it will have the correct - * data in the buffer. - */ - if (!done_copy_out) + new_folio = bh_in->b_folio; + new_offset = offset_in_folio(new_folio, bh_in->b_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + /* + * Fire data frozen trigger if data already wasn't frozen. Do + * this before checking for escaping, as the trigger may modify + * the magic offset. If a copy-out happens afterwards, it will + * have the correct data in the buffer. + */ jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); - - /* - * Check for escaping - */ - if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { - need_copy_out = 1; - do_escape = 1; - } - kunmap_local(mapped_data); - - /* - * Do we need to do a data copy? - */ - if (need_copy_out && !done_copy_out) { - char *tmp; + do_escape = jbd2_data_needs_escaping(mapped_data); + kunmap_local(mapped_data); + /* + * Do we need to do a data copy? + */ + if (!do_escape) + goto escape_done; spin_unlock(&jh_in->b_state_lock); - tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); - if (!tmp) { - brelse(new_bh); - return -ENOMEM; - } + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); - goto repeat; + goto copy_done; } jh_in->b_frozen_data = tmp; memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); - - new_folio = virt_to_folio(tmp); - new_offset = offset_in_folio(new_folio, tmp); - done_copy_out = 1; - /* * This isn't strictly necessary, as we're using frozen * data for the escaping, but it keeps consistency with * b_frozen_data usage. */ jh_in->b_frozen_triggers = jh_in->b_triggers; - } - /* - * Did we need to do an escaping? Now we've done all the - * copying, we can finally do so. - */ - if (do_escape) { - mapped_data = kmap_local_folio(new_folio, new_offset); - *((unsigned int *)mapped_data) = 0; - kunmap_local(mapped_data); +copy_done: + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + jbd2_data_do_escape(jh_in->b_frozen_data); } +escape_done: folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; @@ -454,7 +430,7 @@ repeat: set_buffer_shadow(bh_in); spin_unlock(&jh_in->b_state_lock); - return do_escape | (done_copy_out << 1); + return do_escape; } /* @@ -627,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) { int ret = 0; - transaction_t *commit_trans; + transaction_t *commit_trans, *running_trans; if (!(journal->j_flags & JBD2_BARRIER)) return 0; @@ -637,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) goto out; commit_trans = journal->j_committing_transaction; if (!commit_trans || commit_trans->t_tid != tid) { + running_trans = journal->j_running_transaction; + /* + * The query transaction hasn't started committing, + * it must still be running. + */ + if (WARN_ON_ONCE(!running_trans || + running_trans->t_tid != tid)) + goto out; + + running_trans->t_need_data_flush = 1; ret = 1; goto out; } @@ -724,7 +710,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) return -EINVAL; write_lock(&journal->j_state_lock); - if (tid <= journal->j_commit_sequence) { + if (tid_geq(journal->j_commit_sequence, tid)) { write_unlock(&journal->j_state_lock); return -EALREADY; } @@ -742,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -754,7 +739,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); */ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { - jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); @@ -789,17 +773,7 @@ EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); /* Return 1 when transaction with given tid has already committed. */ int jbd2_transaction_committed(journal_t *journal, tid_t tid) { - int ret = 1; - - read_lock(&journal->j_state_lock); - if (journal->j_running_transaction && - journal->j_running_transaction->t_tid == tid) - ret = 0; - if (journal->j_committing_transaction && - journal->j_committing_transaction->t_tid == tid) - ret = 0; - read_unlock(&journal->j_state_lock); - return ret; + return tid_geq(READ_ONCE(journal->j_commit_sequence), tid); } EXPORT_SYMBOL(jbd2_transaction_committed); @@ -865,17 +839,12 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) *bh_out = NULL; - if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { - fc_off = journal->j_fc_off; - blocknr = journal->j_fc_first + fc_off; - journal->j_fc_off++; - } else { - ret = -EINVAL; - } - - if (ret) - return ret; + if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last) + return -EINVAL; + fc_off = journal->j_fc_off; + blocknr = journal->j_fc_first + fc_off; + journal->j_fc_off++; ret = jbd2_journal_bmap(journal, blocknr, &pblock); if (ret) return ret; @@ -884,7 +853,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) if (!bh) return -ENOMEM; - journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; @@ -927,7 +895,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) } EXPORT_SYMBOL(jbd2_fc_wait_bufs); -int jbd2_fc_release_bufs(journal_t *journal) +void jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; int i, j_fc_off; @@ -941,8 +909,6 @@ int jbd2_fc_release_bufs(journal_t *journal) put_bh(bh); journal->j_fc_wbuf[i] = NULL; } - - return 0; } EXPORT_SYMBOL(jbd2_fc_release_bufs); @@ -989,7 +955,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * descriptor blocks we do need to generate bona fide buffers. * * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ @@ -1034,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -1403,7 +1369,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal) && + if (jbd2_journal_has_csum_v2or3(journal) && jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " @@ -1411,22 +1377,14 @@ static int journal_check_superblock(journal_t *journal) return err; } - /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (jbd2_journal_has_csum_v2or3(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; } - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - return err; - } /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + if (sb->s_checksum != jbd2_superblock_csum(sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; @@ -1451,6 +1409,48 @@ static int journal_revoke_records_per_block(journal_t *journal) return space / record_size; } +static int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 3; +} + +/* + * Base amount of descriptor blocks we reserve for each transaction. + */ +static int jbd2_descriptor_blocks_per_trans(journal_t *journal) +{ + int tag_space = journal->j_blocksize - sizeof(journal_header_t); + int tags_per_block; + + /* Subtract UUID */ + tag_space -= 16; + if (jbd2_journal_has_csum_v2or3(journal)) + tag_space -= sizeof(struct jbd2_journal_block_tail); + /* Commit code leaves a slack space of 16 bytes at the end of block */ + tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); + /* + * Revoke descriptors are accounted separately so we need to reserve + * space for commit block and normal transaction descriptor blocks. + */ + return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal), + tags_per_block); +} + +/* + * Initialize number of blocks each transaction reserves for its bookkeeping + * and maximum number of blocks a transaction can use. This needs to be called + * after the journal size and the fastcommit area size are initialized. + */ +static void jbd2_journal_init_transaction_limits(journal_t *journal) +{ + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + journal->j_transaction_overhead_buffers = + jbd2_descriptor_blocks_per_trans(journal); + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); +} + /* * Load the on-disk journal superblock and read the key fields into the * journal_t. @@ -1490,10 +1490,10 @@ static int journal_load_superblock(journal_t *journal) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all metadata */ if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); + /* After journal features are set, we can compute transaction limits */ + jbd2_journal_init_transaction_limits(journal); if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); @@ -1512,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal) * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ +/* The journal_init_common() function creates and fills a journal_t object + * in memory. It calls journal_load_superblock() to load the on-disk journal + * superblock and initialize the journal_t object. + */ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, @@ -1599,7 +1600,6 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; journal->j_shrinker->count_objects = jbd2_journal_shrink_count; - journal->j_shrinker->batch = journal->j_max_transaction_buffers; journal->j_shrinker->private_data = journal; shrinker_register(journal->j_shrinker); @@ -1608,8 +1608,6 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: percpu_counter_destroy(&journal->j_checkpoint_jh_count); - if (journal->j_chksum_driver) - crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); @@ -1743,8 +1741,6 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); - /* * Now that journal recovery is done, turn fast commits off here. This * way, if fast commit was enabled before the crash but if now FS has @@ -1823,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) - sb->s_checksum = jbd2_superblock_csum(journal, sb); + sb->s_checksum = jbd2_superblock_csum(sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); @@ -1881,7 +1877,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, /* Log is no longer empty */ write_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); @@ -1929,7 +1924,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) if (had_fast_commit) jbd2_set_feature_fast_commit(journal); - /* Log is no longer empty */ + /* Log is empty */ write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); @@ -1977,17 +1972,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) return err; } - if (block_start == ~0ULL) { - block_start = phys_block; - block_stop = block_start - 1; - } + if (block_start == ~0ULL) + block_stop = block_start = phys_block; /* * last block not contiguous with current block, * process last contiguous region and return to this block on * next loop */ - if (phys_block != block_stop + 1) { + if (phys_block != block_stop) { block--; } else { block_stop++; @@ -2006,11 +1999,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) */ byte_start = block_start * journal->j_blocksize; byte_stop = block_stop * journal->j_blocksize; - byte_count = (block_stop - block_start + 1) * - journal->j_blocksize; + byte_count = (block_stop - block_start) * journal->j_blocksize; - truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, - byte_start, byte_stop); + truncate_inode_pages_range(journal->j_dev->bd_mapping, + byte_start, byte_stop - 1); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { err = blkdev_issue_discard(journal->j_dev, @@ -2025,7 +2017,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) } if (unlikely(err != 0)) { - pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", err, block_start, block_stop); return err; } @@ -2193,8 +2185,6 @@ int jbd2_journal_destroy(journal_t *journal) iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); - if (journal->j_chksum_driver) - crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2285,8 +2275,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; journal->j_free = journal->j_last - journal->j_first; - journal->j_max_transaction_buffers = - jbd2_journal_get_max_txn_bufs(journal); return 0; } @@ -2341,27 +2329,15 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, } } - /* Load the checksum driver if necessary */ - if ((journal->j_chksum_driver == NULL) && - INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - } - lock_buffer(journal->j_sb_buffer); - /* If enabling v3 checksums, update superblock */ + /* If enabling v3 checksums, update superblock and precompute seed */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, + sizeof(sb->s_uuid)); } /* If enabling v1 checksums, downgrade superblock */ @@ -2374,8 +2350,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); unlock_buffer(journal->j_sb_buffer); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); + jbd2_journal_init_transaction_limits(journal); return 1; #undef COMPAT_FEATURE_ON @@ -2406,8 +2381,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); + jbd2_journal_init_transaction_limits(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); @@ -2681,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal) write_unlock(&journal->j_state_lock); } -int jbd2_journal_blocks_per_page(struct inode *inode) +int jbd2_journal_blocks_per_folio(struct inode *inode) { - return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); } /* @@ -2855,8 +2830,7 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } - if (ret) - spin_lock_init(&ret->b_state_lock); + spin_lock_init(&ret->b_state_lock); return ret; } @@ -3181,6 +3155,7 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } +MODULE_DESCRIPTION("Generic filesystem journal-writing module"); MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1f7664984d6e..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/crc32.h> #include <linux/blkdev.h> +#include <linux/string_choices.h> #endif /* @@ -38,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -64,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n) */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static void do_readahead(journal_t *journal, unsigned int start) { - int err; unsigned int max, nbufs, next; unsigned long long blocknr; struct buffer_head *bh; @@ -84,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start) nbufs = 0; for (next = start; next < max; next++) { - err = jbd2_journal_bmap(journal, next, &blocknr); + int err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { printk(KERN_ERR "JBD2: bad block at offset %u\n", @@ -93,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start) } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; + if (!bh) goto failed; - } if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; @@ -111,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start) if (nbufs) bh_readahead_batch(nbufs, bufs, 0); - err = 0; failed: if (nbufs) journal_brelse_array(bufs, nbufs); - return err; } #endif /* __KERNEL__ */ @@ -189,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -286,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. */ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -326,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -374,7 +377,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) be32_to_cpu(journal->j_superblock->s_sequence); jbd2_debug(1, "JBD2: ignoring %d transaction%s from the journal.\n", - dropped, (dropped == 1) ? "" : "s"); + dropped, str_plural(dropped)); #endif journal->j_transaction_sequence = ++info.end_transaction; journal->j_head = info.head_block; @@ -437,12 +440,33 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); } +static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) +{ + struct commit_header *h; + __be32 provided; + __u32 calculated; + void *tmpbuf; + + tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL); + if (!tmpbuf) + return false; + + memcpy(tmpbuf, buf, sizeof(struct commit_header)); + h = tmpbuf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); + kfree(tmpbuf); + + return provided == cpu_to_be32(calculated); +} + static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, journal_block_tag3_t *tag3, void *buf, __u32 sequence) @@ -454,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); @@ -463,6 +487,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return tag->t_checksum == cpu_to_be16(csum32); } +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID) +{ + char *tagp; + int flags; + int ret = 0; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + int err; + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + ret = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. */ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + ret = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return ret; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -471,13 +593,10 @@ static int do_one_pass(journal_t *journal, int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; - struct buffer_head * bh; + struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; - int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - int descr_csum_size = 0; - int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -495,6 +614,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -506,12 +650,6 @@ static int do_one_pass(journal_t *journal, */ while (1) { - int flags; - char * tagp; - journal_block_tag_t tag; - struct buffer_head * obh; - struct buffer_head * nbh; - cond_resched(); /* If we already know where to stop the log traversal, @@ -530,6 +668,8 @@ static int do_one_pass(journal_t *journal, * record. */ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -545,20 +685,16 @@ static int do_one_pass(journal_t *journal, tmp = (journal_header_t *)bh->b_data; - if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { - brelse(bh); + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) break; - } blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); - if (sequence != next_commit_ID) { - brelse(bh); + if (sequence != next_commit_ID) break; - } /* OK, we have a valid descriptor block which matches * all of the sequence number checks. What are we going @@ -567,11 +703,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (jbd2_journal_has_csum_v2or3(journal)) - descr_csum_size = - sizeof(struct jbd2_journal_block_tail); - if (descr_csum_size > 0 && - !jbd2_descriptor_block_csum_verify(journal, + if (!jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { /* * PASS_SCAN can see stale blocks due to lazy @@ -581,7 +713,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum recovering block %lu in log\n", next_log_block); err = -EFSBADCRC; - brelse(bh); goto failed; } need_check_commit_time = true; @@ -597,125 +728,39 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && - !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, - &crc32_sum)) { - put_bh(bh); + &crc32_sum)) break; - } - put_bh(bh); continue; } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - put_bh(bh); continue; } - /* A descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize - descr_csum_size) { - unsigned long io_block; - - memcpy(&tag, tagp, sizeof(tag)); - flags = be16_to_cpu(tag.t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk(KERN_ERR - "JBD2: IO error %d recovering " - "block %lu in log\n", - err, io_block); - } else { - unsigned long long blocknr; - - J_ASSERT(obh != NULL); - blocknr = read_tag_block(journal, - &tag); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (jbd2_journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Look for block corruption */ - if (!jbd2_block_tag_csum_verify( - journal, &tag, (journal_block_tag3_t *)tagp, - obh->b_data, be32_to_cpu(tmp->h_sequence))) { - brelse(obh); - success = -EFSBADCRC; - printk(KERN_ERR "JBD2: Invalid " - "checksum recovering " - "data block %llu in " - "journal block %lu\n", - blocknr, io_block); - block_error = 1; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD2: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JBD2_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JBD2_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += tag_bytes; - if (!(flags & JBD2_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JBD2_FLAG_LAST_TAG) - break; + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! + */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; } - brelse(bh); continue; case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + /* How to differentiate between interrupted commit * and journal corruption ? * @@ -760,7 +805,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum found in transaction %u\n", next_commit_ID); err = -EFSBADCRC; - brelse(bh); goto failed; } ignore_crc_mismatch: @@ -770,7 +814,6 @@ static int do_one_pass(journal_t *journal, */ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - brelse(bh); goto done; } @@ -780,8 +823,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ - if (pass == PASS_SCAN && - jbd2_has_feature_checksum(journal)) { + if (jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = @@ -790,7 +832,6 @@ static int do_one_pass(journal_t *journal, if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; - brelse(bh); break; } @@ -806,33 +847,45 @@ static int do_one_pass(journal_t *journal, goto chksum_error; crc32_sum = ~0; + goto chksum_ok; } - if (pass == PASS_SCAN && - !jbd2_commit_block_csum_verify(journal, - bh->b_data)) { - chksum_error: - if (commit_time < last_trans_commit_time) - goto ignore_crc_mismatch; - info->end_transaction = next_commit_ID; - info->head_block = head_block; - - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } + + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; } - if (pass == PASS_SCAN) { - last_trans_commit_time = commit_time; - head_block = next_log_block; + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; } - brelse(bh); + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; next_commit_ID++; continue; case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -843,16 +896,9 @@ static int do_one_pass(journal_t *journal, next_log_block); need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) { - brelse(bh); - continue; - } - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); - brelse(bh); if (err) goto failed; continue; @@ -860,12 +906,12 @@ static int do_one_pass(journal_t *journal, default: jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); - brelse(bh); goto done; } } done: + brelse(bh); /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the @@ -896,18 +942,18 @@ static int do_one_pass(journal_t *journal, success = err; } - if (block_error && success == 0) - success = -EIO; return success; failed: + brelse(bh); return err; } /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -928,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -940,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e4689024..1467f6790747 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void) return 0; } -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; @@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; @@ -245,7 +245,7 @@ out: return table; } -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; @@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) J_ASSERT(list_empty(hash_list)); } - kfree(table->hash_table); + kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } @@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, bh = bh_in; if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } @@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, /* If there is a different buffer_head lying around in * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh2 = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) @@ -420,12 +422,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ -int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; - int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -450,7 +451,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; } } @@ -466,18 +466,18 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, + bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } - return did_revoke; } /* - * journal_clear_revoked_flag clears revoked flag of buffers in + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ @@ -495,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); + bh = __find_get_block_nonatomic(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); @@ -506,9 +506,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) } } -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz +/* jbd2_journal_switch_revoke_table table select j_revoke for next + * transaction we do not want to suspend any processing until all + * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cb0b8d6fc0c6..c7867139af69 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,28 +63,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction) } /* - * Base amount of descriptor blocks we reserve for each transaction. - */ -static int jbd2_descriptor_blocks_per_trans(journal_t *journal) -{ - int tag_space = journal->j_blocksize - sizeof(journal_header_t); - int tags_per_block; - - /* Subtract UUID */ - tag_space -= 16; - if (jbd2_journal_has_csum_v2or3(journal)) - tag_space -= sizeof(struct jbd2_journal_block_tail); - /* Commit code leaves a slack space of 16 bytes at the end of block */ - tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); - /* - * Revoke descriptors are accounted separately so we need to reserve - * space for commit block and normal transaction descriptor blocks. - */ - return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, - tags_per_block); -} - -/* * jbd2_get_transaction: obtain a new transaction_t object. * * Simply initialise a new transaction. Initialize it in @@ -109,12 +87,11 @@ static void jbd2_get_transaction(journal_t *journal, transaction->t_expires = jiffies + journal->j_commit_interval; atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, - jbd2_descriptor_blocks_per_trans(journal) + + journal->j_transaction_overhead_buffers + atomic_read(&journal->j_reserved_credits)); atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); - INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); @@ -136,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal, */ /* - * Update transaction's maximum wait time, if debugging is enabled. - * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiplre threads trying to do this simultaneously * hence using cmpxchg to avoid any use of locks in this case. - * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. */ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) @@ -213,6 +187,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks) wake_up(&journal->j_wait_reserved); } +/* Maximum number of blocks for user transaction payload */ +static int jbd2_max_user_trans_buffers(journal_t *journal) +{ + return journal->j_max_transaction_buffers - + journal->j_transaction_overhead_buffers; +} + /* * Wait until we can add credits for handle to the running transaction. Called * with j_state_lock held for reading. Returns 0 if handle joined the running @@ -262,12 +243,12 @@ __must_hold(&journal->j_state_lock) * big to fit this handle? Wait until reserved credits are freed. */ if (atomic_read(&journal->j_reserved_credits) + total > - journal->j_max_transaction_buffers) { + jbd2_max_user_trans_buffers(journal)) { read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= - journal->j_max_transaction_buffers); + jbd2_max_user_trans_buffers(journal)); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -307,14 +288,14 @@ __must_hold(&journal->j_state_lock) needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); /* We allow at most half of a transaction to be reserved */ - if (needed > journal->j_max_transaction_buffers / 2) { + if (needed > jbd2_max_user_trans_buffers(journal) / 2) { sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks - <= journal->j_max_transaction_buffers / 2); + <= jbd2_max_user_trans_buffers(journal) / 2); __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -344,12 +325,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, * size and limit the number of total credits to not exceed maximum * transaction size per operation. */ - if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || - (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { + if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 || + rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) { printk(KERN_ERR "JBD2: %s wants too many credits " "credits:%d rsv_credits:%d max:%d\n", current->comm, blocks, rsv_blocks, - journal->j_max_transaction_buffers); + jbd2_max_user_trans_buffers(journal)); WARN_ON(1); return -ENOSPC; } @@ -1528,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } - if (jh->b_modified == 1) { + if (data_race(jh->b_modified == 1)) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { @@ -1547,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out; } - journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { @@ -1562,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } + journal = transaction->t_journal; + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part @@ -2094,21 +2076,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) jh->b_transaction = NULL; } -void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - spin_lock(&jh->b_state_lock); - spin_lock(&journal->j_list_lock); - __jbd2_journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - spin_unlock(&jh->b_state_lock); - jbd2_journal_put_journal_head(jh); - __brelse(bh); -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -2207,7 +2174,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in - * __journal_file_buffer + * __jbd2_journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); |