summaryrefslogtreecommitdiff
path: root/fs/jbd2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2')
-rw-r--r--fs/jbd2/Kconfig3
-rw-r--r--fs/jbd2/Makefile1
-rw-r--r--fs/jbd2/checkpoint.c449
-rw-r--r--fs/jbd2/commit.c345
-rw-r--r--fs/jbd2/journal.c1645
-rw-r--r--fs/jbd2/recovery.c633
-rw-r--r--fs/jbd2/revoke.c105
-rw-r--r--fs/jbd2/transaction.c1040
8 files changed, 2538 insertions, 1683 deletions
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 5a9f5534d57b..9c19e1512101 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,8 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config JBD2
tristate
select CRC32
- select CRYPTO
- select CRYPTO_CRC32C
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index 802a3413872a..126b4da6c7de 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the linux journaling routines.
#
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4055f51617ef..de89c5bef607 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/checkpoint.c
*
@@ -5,10 +6,6 @@
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
@@ -30,7 +27,7 @@
*
* Called with j_list_lock held.
*/
-static inline void __buffer_unlink_first(struct journal_head *jh)
+static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
@@ -44,79 +41,22 @@ static inline void __buffer_unlink_first(struct journal_head *jh)
}
/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
- if (transaction->t_checkpoint_io_list == jh) {
- transaction->t_checkpoint_io_list = jh->b_cpnext;
- if (transaction->t_checkpoint_io_list == jh)
- transaction->t_checkpoint_io_list = NULL;
- }
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
-/*
- * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it and 2 if we also released the
- * whole transaction.
- *
- * Requires j_list_lock
- */
-static int __try_to_free_cp_buf(struct journal_head *jh)
-{
- int ret = 0;
- struct buffer_head *bh = jh2bh(jh);
-
- if (jh->b_transaction == NULL && !buffer_locked(bh) &&
- !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- }
- return ret;
-}
-
-/*
* __jbd2_log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
+__acquires(&journal->j_state_lock)
+__releases(&journal->j_state_lock)
{
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd2_space_needed(journal);
+ nblocks = journal->j_max_transaction_buffers;
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
@@ -135,22 +75,27 @@ void __jbd2_log_wait_for_space(journal_t *journal)
return;
}
spin_lock(&journal->j_list_lock);
- nblocks = jbd2_space_needed(journal);
space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
+ bool has_transaction = false;
- if (journal->j_committing_transaction)
+ if (journal->j_committing_transaction) {
tid = journal->j_committing_transaction->t_tid;
+ has_transaction = true;
+ }
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
if (chkpt) {
jbd2_log_do_checkpoint(journal);
- } else if (jbd2_cleanup_journal_tail(journal) == 0) {
- /* We were able to recover space; yay! */
+ } else if (jbd2_cleanup_journal_tail(journal) <= 0) {
+ /*
+ * We were able to recover space or the
+ * journal was aborted due to an error.
+ */
;
- } else if (tid) {
+ } else if (has_transaction) {
/*
* jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED
@@ -168,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
- jbd2_journal_abort(journal, 0);
+ jbd2_journal_abort(journal, -ENOSPC);
}
write_lock(&journal->j_state_lock);
} else {
@@ -186,13 +131,14 @@ __flush_batch(journal_t *journal, int *batch_count)
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
+ write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -213,7 +159,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
tid_t this_tid;
int result, batch_count = 0;
- jbd_debug(1, "Start checkpoint\n");
+ jbd2_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
@@ -222,7 +168,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
- jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+ jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
@@ -230,7 +176,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
- result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
@@ -253,15 +198,6 @@ restart:
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- get_bh(bh);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- goto retry;
- }
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
@@ -279,36 +215,67 @@ restart:
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
jbd2_log_start_commit(journal, tid);
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set, jbd2_update_log_tail() called by
+ * jbd2_journal_commit_transaction() may also take
+ * checkpoint_mutex. So we need to temporarily
+ * drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
- goto retry;
+ mutex_lock_io(&journal->j_checkpoint_mutex);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
}
- if (!buffer_dirty(bh)) {
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
+ if (!trylock_buffer(bh)) {
+ /*
+ * The buffer is locked, it may be writing back, or
+ * flushing out in the last couple of cycles, or
+ * re-adding into a new transaction, need to check
+ * it again until it's unlocked.
+ */
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ goto retry;
+ } else if (!buffer_dirty(bh)) {
+ unlock_buffer(bh);
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ unlock_buffer(bh);
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -318,50 +285,14 @@ restart:
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
+ cond_resched();
spin_lock(&journal->j_list_lock);
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- get_bh(bh);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
- if (result < 0)
- jbd2_journal_abort(journal, result);
- else
- result = jbd2_cleanup_journal_tail(journal);
+ result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
@@ -405,7 +336,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_fs_dev);
return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
@@ -414,20 +345,25 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
/*
- * journal_clean_one_cp_list
+ * journal_shrink_one_cp_list
*
- * Find all the written-back checkpoint buffers in the given list and
- * release them. If 'destroy' is set, clean all buffers unconditionally.
+ * Find all the written-back checkpoint buffers in the given list
+ * and try to release them. If the whole transaction is released, set
+ * the 'released' parameter. Return the number of released checkpointed
+ * buffers.
*
* Called with j_list_lock held.
- * Returns 1 if we freed the transaction, 0 otherwise.
*/
-static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
+static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
+ enum jbd2_shrink_type type,
+ bool *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
+ unsigned long nr_freed = 0;
int ret;
+ *released = false;
if (!jh)
return 0;
@@ -435,39 +371,129 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
do {
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy)
- ret = __try_to_free_cp_buf(jh);
- else
- ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- if (!ret)
- return 0;
- if (ret == 2)
- return 1;
- /*
- * This function only frees up some memory
- * if possible so we dont have an obligation
- * to finish processing. Bail out if preemption
- * requested:
- */
+
+ if (type == JBD2_SHRINK_DESTROY) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0) {
+ if (type == JBD2_SHRINK_BUSY_SKIP)
+ continue;
+ break;
+ }
+ }
+
+ nr_freed++;
+ if (ret) {
+ *released = true;
+ break;
+ }
+
if (need_resched())
- return 0;
+ break;
} while (jh != last_jh);
- return 0;
+ return nr_freed;
+}
+
+/*
+ * jbd2_journal_shrink_checkpoint_list
+ *
+ * Find 'nr_to_scan' written-back checkpoint buffers in the journal
+ * and try to release them. Return the number of released checkpointed
+ * buffers.
+ *
+ * Called with j_list_lock held.
+ */
+unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
+ unsigned long *nr_to_scan)
+{
+ transaction_t *transaction, *last_transaction, *next_transaction;
+ bool __maybe_unused released;
+ tid_t first_tid = 0, last_tid = 0, next_tid = 0;
+ tid_t tid = 0;
+ unsigned long nr_freed = 0;
+ unsigned long freed;
+ bool first_set = false;
+
+again:
+ spin_lock(&journal->j_list_lock);
+ if (!journal->j_checkpoint_transactions) {
+ spin_unlock(&journal->j_list_lock);
+ goto out;
+ }
+
+ /*
+ * Get next shrink transaction, resume previous scan or start
+ * over again. If some others do checkpoint and drop transaction
+ * from the checkpoint list, we ignore saved j_shrink_transaction
+ * and start over unconditionally.
+ */
+ if (journal->j_shrink_transaction)
+ transaction = journal->j_shrink_transaction;
+ else
+ transaction = journal->j_checkpoint_transactions;
+
+ if (!first_set) {
+ first_tid = transaction->t_tid;
+ first_set = true;
+ }
+ last_transaction = journal->j_checkpoint_transactions->t_cpprev;
+ next_transaction = transaction;
+ last_tid = last_transaction->t_tid;
+ do {
+ transaction = next_transaction;
+ next_transaction = transaction->t_cpnext;
+ tid = transaction->t_tid;
+
+ freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ JBD2_SHRINK_BUSY_SKIP, &released);
+ nr_freed += freed;
+ (*nr_to_scan) -= min(*nr_to_scan, freed);
+ if (*nr_to_scan == 0)
+ break;
+ if (need_resched() || spin_needbreak(&journal->j_list_lock))
+ break;
+ } while (transaction != last_transaction);
+
+ if (transaction != last_transaction) {
+ journal->j_shrink_transaction = next_transaction;
+ next_tid = next_transaction->t_tid;
+ } else {
+ journal->j_shrink_transaction = NULL;
+ next_tid = 0;
+ }
+
+ spin_unlock(&journal->j_list_lock);
+ cond_resched();
+
+ if (*nr_to_scan && journal->j_shrink_transaction)
+ goto again;
+out:
+ trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
+ nr_freed, next_tid);
+
+ return nr_freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
- * If 'destroy' is set, release all buffers unconditionally.
+ * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If
+ * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a
+ * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some
+ * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function.
*
* Called with j_list_lock held.
*/
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal,
+ enum jbd2_shrink_type type)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- int ret;
+ bool released;
+
+ WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP);
transaction = journal->j_checkpoint_transactions;
if (!transaction)
@@ -478,8 +504,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
- destroy);
+ journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ type, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
@@ -487,23 +513,12 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
*/
if (need_resched())
return;
- if (ret)
- continue;
- /*
- * It is essential that we are as careful as in the case of
- * t_checkpoint_list with removing the buffer from the list as
- * we can possibly see not yet submitted buffers on io_list
- */
- ret = journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list, destroy);
- if (need_resched())
- return;
/*
* Stop scanning if we couldn't free the transaction. This
* avoids pointless scanning of transactions which still
* weren't checkpointed.
*/
- if (!ret)
+ if (!released)
return;
} while (transaction != last_transaction);
}
@@ -524,7 +539,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal)
spin_unlock(&journal->j_list_lock);
break;
}
- __jbd2_journal_clean_checkpoint_list(journal, true);
+ __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY);
spin_unlock(&journal->j_list_lock);
cond_resched();
}
@@ -553,24 +568,26 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
- int ret = 0;
JBUFFER_TRACE(jh, "entry");
- if ((transaction = jh->b_cp_transaction) == NULL) {
+ transaction = jh->b_cp_transaction;
+ if (!transaction) {
JBUFFER_TRACE(jh, "not on transaction");
- goto out;
+ return 0;
}
journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction");
+
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
+ percpu_counter_dec(&journal->j_checkpoint_jh_count);
jbd2_journal_put_journal_head(jh);
- if (transaction->t_checkpoint_list != NULL ||
- transaction->t_checkpoint_io_list != NULL)
- goto out;
+ /* Is this transaction empty? */
+ if (transaction->t_checkpoint_list)
+ return 0;
/*
* There is one special case to worry about: if we have just pulled the
@@ -582,10 +599,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
* See the comment at the end of jbd2_journal_commit_transaction().
*/
if (transaction->t_state != T_FINISHED)
- goto out;
+ return 0;
- /* OK, that was the last buffer for the transaction: we can now
- safely remove this transaction from the log */
+ /*
+ * OK, that was the last buffer for the transaction, we can now
+ * safely remove this transaction from the log.
+ */
stats = &transaction->t_chp_stats;
if (stats->cs_chp_time)
stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
@@ -595,9 +614,37 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
- ret = 1;
-out:
- return ret;
+ return 1;
+}
+
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (jh->b_transaction)
+ return -EBUSY;
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
}
/*
@@ -628,6 +675,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
+ percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count);
}
/*
@@ -643,6 +691,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
+
+ journal->j_shrink_transaction = NULL;
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
@@ -658,12 +708,11 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
trace_jbd2_drop_transaction(journal, transaction);
- jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+ jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3c1c31321d9b..7203d2d2624d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/commit.c
*
@@ -5,10 +6,6 @@
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Journal commit routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
@@ -60,32 +57,30 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under j_list_lock. The caller provided us with a ref against the
+ * buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
- struct page *page;
+ struct folio *folio;
if (buffer_dirty(bh))
goto nope;
if (atomic_read(&bh->b_count) != 1)
goto nope;
- page = bh->b_page;
- if (!page)
- goto nope;
- if (page->mapping)
+ folio = bh->b_folio;
+ if (folio->mapping)
goto nope;
/* OK, it's a truncated page */
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto nope;
- get_page(page);
+ folio_get(folio);
__brelse(bh);
- try_to_free_buffers(page);
- unlock_page(page);
- put_page(page);
+ try_to_free_buffers(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return;
nope:
@@ -104,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -123,8 +118,8 @@ static int journal_submit_commit_record(journal_t *journal,
{
struct commit_header *tmp;
struct buffer_head *bh;
- int ret;
- struct timespec64 now = current_kernel_time64();
+ struct timespec64 now;
+ blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
*cbh = NULL;
@@ -137,6 +132,7 @@ static int journal_submit_commit_record(journal_t *journal,
return 1;
tmp = (struct commit_header *)bh->b_data;
+ ktime_get_coarse_real_ts64(&now);
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
@@ -155,13 +151,11 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
- else
- ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+ write_flags |= REQ_PREFLUSH | REQ_FUA;
+ submit_bh(write_flags, bh);
*cbh = bh;
- return ret;
+ return 0;
}
/*
@@ -183,25 +177,28 @@ static int journal_wait_on_commit_record(journal_t *journal,
return ret;
}
-/*
- * write the filemap data using writepage() address_space_operations.
- * We don't do block allocation here even for delalloc. We don't
- * use writepages() because with dealyed allocation we may be doing
- * block allocation in writepages().
- */
-static int journal_submit_inode_data_buffers(struct address_space *mapping)
+/* Send all the data buffers related to an inode */
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
- int ret;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = 0,
- .range_end = i_size_read(mapping->host),
- };
-
- ret = generic_writepages(mapping, &wbc);
- return ret;
+ if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
+ return 0;
+
+ trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+ return journal->j_submit_inode_data_buffers(jinode);
+
+}
+EXPORT_SYMBOL(jbd2_submit_inode_data);
+
+int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+ if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
+ !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
+ return 0;
+ return filemap_fdatawait_range_keep_errors(
+ jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
+ jinode->i_dirty_end);
}
+EXPORT_SYMBOL(jbd2_wait_inode_data);
/*
* Submit all the data buffers of inode associated with the transaction to
@@ -216,25 +213,20 @@ static int journal_submit_data_buffers(journal_t *journal,
{
struct jbd2_inode *jinode;
int err, ret = 0;
- struct address_space *mapping;
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
if (!(jinode->i_flags & JI_WRITE_DATA))
continue;
- mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- /*
- * submit the inode data buffers. We use writepage
- * instead of writepages. Because writepages can do
- * block allocation with delalloc. We need to write
- * only allocated blocks here.
- */
+ /* submit the inode data buffers. */
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- err = journal_submit_inode_data_buffers(mapping);
- if (!ret)
- ret = err;
+ if (journal->j_submit_inode_data_buffers) {
+ err = journal->j_submit_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
@@ -245,6 +237,15 @@ static int journal_submit_data_buffers(journal_t *journal,
return ret;
}
+int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+
+ return filemap_fdatawait_range_keep_errors(mapping,
+ jinode->i_dirty_start,
+ jinode->i_dirty_end);
+}
+
/*
* Wait for data submitted for writeout, refile inodes to proper
* transaction if needed.
@@ -263,10 +264,13 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait_keep_errors(
- jinode->i_vfs_inode->i_mapping);
- if (!ret)
- ret = err;
+ /* wait for the inode data buffers writeout. */
+ if (journal->j_finish_inode_data_buffers) {
+ err = journal->j_finish_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
+ cond_resched();
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
@@ -284,6 +288,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
&jinode->i_transaction->t_inode_list);
} else {
jinode->i_transaction = NULL;
+ jinode->i_dirty_start = 0;
+ jinode->i_dirty_end = 0;
}
}
spin_unlock(&journal->j_list_lock);
@@ -293,14 +299,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
- struct page *page = bh->b_page;
char *addr;
__u32 checksum;
- addr = kmap_atomic(page);
- checksum = crc32_be(crc32_sum,
- (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+ checksum = crc32_be(crc32_sum, addr, bh->b_size);
+ kunmap_local(addr);
return checksum;
}
@@ -317,7 +321,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
- struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
@@ -326,11 +329,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
return;
seq = cpu_to_be32(sequence);
- addr = kmap_atomic(page);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
- bh->b_size);
- kunmap_atomic(addr);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, addr, bh->b_size);
+ kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
@@ -351,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
- int flags;
+ int escape;
int err;
unsigned long long blocknr;
ktime_t start_time;
@@ -384,7 +386,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
- jbd_debug(3, "super block updated\n");
+ jbd2_debug(3, "super block updated\n");
mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* We hold j_checkpoint_mutex so tail cannot change under us.
@@ -394,23 +396,46 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC);
+ journal->j_tail, 0);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
- jbd_debug(3, "superblock not updated\n");
+ jbd2_debug(3, "superblock not updated\n");
}
J_ASSERT(journal->j_running_transaction != NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ write_lock(&journal->j_state_lock);
+ finish_wait(&journal->j_fc_wait, &wait);
+ /*
+ * TODO: by blocking fast commits here, we are increasing
+ * fsync() latency slightly. Strictly speaking, we don't need
+ * to block fast commits until the transaction enters T_FLUSH
+ * state. So an optimization is possible where we block new fast
+ * commits here and wait for existing ones to complete
+ * just before we enter T_FLUSH. That way, the existing fast
+ * commits and this full commit can proceed parallely.
+ */
+ }
+ write_unlock(&journal->j_state_lock);
+
commit_transaction = journal->j_running_transaction;
trace_jbd2_start_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+ jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ journal->j_fc_off = 0;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
@@ -425,22 +450,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
- spin_lock(&commit_transaction->t_handle_lock);
- while (atomic_read(&commit_transaction->t_updates)) {
- DEFINE_WAIT(wait);
+ // waits for any t_updates to finish
+ jbd2_journal_wait_updates(journal);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&commit_transaction->t_updates)) {
- spin_unlock(&commit_transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- write_lock(&journal->j_state_lock);
- spin_lock(&commit_transaction->t_handle_lock);
- }
- finish_wait(&journal->j_wait_updates, &wait);
- }
- spin_unlock(&commit_transaction->t_handle_lock);
+ commit_transaction->t_state = T_SWITCH;
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -460,6 +473,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -471,24 +486,25 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
jbd2_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
* frees some memory
*/
spin_lock(&journal->j_list_lock);
- __jbd2_journal_clean_checkpoint_list(journal, false);
+ __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
spin_unlock(&journal->j_list_lock);
- jbd_debug(3, "JBD2: commit phase 1\n");
+ jbd2_debug(3, "JBD2: commit phase 1\n");
/*
* Clear revoked flag to reflect there is no revoked buffers
@@ -501,6 +517,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ write_lock(&journal->j_state_lock);
/*
* Reserved credits cannot be claimed anymore, free them
*/
@@ -517,10 +534,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2a\n");
+ jbd2_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -533,7 +550,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
- jbd_debug(3, "JBD2: commit phase 2b\n");
+ jbd2_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -548,14 +565,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_logging = jiffies;
stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
stats.run.rs_logging);
- stats.run.rs_blocks =
- atomic_read(&commit_transaction->t_outstanding_credits);
+ stats.run.rs_blocks = commit_transaction->t_nr_buffers;
stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
atomic_read(&commit_transaction->t_outstanding_credits));
- err = 0;
bufs = 0;
descriptor = NULL;
while (commit_transaction->t_buffers) {
@@ -590,7 +605,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (!descriptor) {
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD2: get descriptor\n");
+ jbd2_debug(4, "JBD2: get descriptor\n");
descriptor = jbd2_journal_get_descriptor_buffer(
commit_transaction,
@@ -600,7 +615,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+ jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)descriptor->b_blocknr,
descriptor->b_data);
tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -630,8 +645,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/*
* start_this_handle() uses t_outstanding_credits to determine
- * the free space in the log, but this counter is changed
- * by jbd2_journal_next_log_block() also.
+ * the free space in the log.
*/
atomic_dec(&commit_transaction->t_outstanding_credits);
@@ -646,19 +660,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
- flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+ escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
- if (flags < 0) {
- jbd2_journal_abort(journal, flags);
- continue;
- }
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
tag_flag = 0;
- if (flags & 1)
+ if (escape)
tag_flag |= JBD2_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JBD2_FLAG_SAME_UUID;
@@ -686,18 +696,21 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_buffers == NULL ||
space_left < tag_bytes + 16 + csum_size) {
- jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+ jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
the last tag we set up. */
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
- jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
+ if (descriptor)
+ jbd2_descriptor_block_csum_set(journal,
+ descriptor);
+
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
+
/*
* Compute checksum.
*/
@@ -710,10 +723,10 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+ submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+ bh);
}
cond_resched();
- stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -725,10 +738,8 @@ start_journal_io:
err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) {
printk(KERN_WARNING
- "JBD2: Detected IO errors while flushing file data "
- "on %s\n", journal->j_devname);
- if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
- jbd2_journal_abort(journal, err);
+ "JBD2: Detected IO errors %d while flushing file data on %s\n",
+ err, journal->j_devname);
err = 0;
}
@@ -749,29 +760,29 @@ start_journal_io:
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
- if (freed < journal->j_maxlen / 4)
+ if (freed < journal->j_max_transaction_buffers)
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
- /*
+ /*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
- * the commit record
+ * the commit record and update the journal tail sequence.
*/
- if (commit_transaction->t_need_data_flush &&
+ if ((commit_transaction->t_need_data_flush || update_tail) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_fs_dev);
/* Done it all: now write the commit record asynchronously. */
if (jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
blk_finish_plug(&plug);
@@ -787,7 +798,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD2: commit phase 3\n");
+ jbd2_debug(3, "JBD2: commit phase 3\n");
while (!list_empty(&io_bufs)) {
struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -800,6 +811,7 @@ start_journal_io:
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
/*
* The list contains temporary buffer heads created by
@@ -829,7 +841,7 @@ start_journal_io:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD2: commit phase 4\n");
+ jbd2_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
while (!list_empty(&log_bufs)) {
@@ -845,6 +857,7 @@ start_journal_io:
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -852,7 +865,7 @@ start_journal_io:
if (err)
jbd2_journal_abort(journal, err);
- jbd_debug(3, "JBD2: commit phase 5\n");
+ jbd2_debug(3, "JBD2: commit phase 5\n");
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -862,18 +875,22 @@ start_journal_io:
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
+ stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
- blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_dev);
}
if (err)
jbd2_journal_abort(journal, err);
+ WARN_ON_ONCE(
+ atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
/*
* Now disk caches for filesystem device are flushed so we are safe to
* erase checkpointed transactions from the log by updating journal
@@ -887,7 +904,7 @@ start_journal_io:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD2: commit phase 6\n");
+ jbd2_debug(3, "JBD2: commit phase 6\n");
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -904,6 +921,7 @@ restart_loop:
transaction_t *cp_transaction;
struct buffer_head *bh;
int try_to_free = 0;
+ bool drop_ref;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
@@ -913,7 +931,7 @@ restart_loop:
* done with it.
*/
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
@@ -958,29 +976,34 @@ restart_loop:
* it. */
/*
- * A buffer which has been freed while still being journaled by
- * a previous transaction.
- */
- if (buffer_freed(bh)) {
+ * A buffer which has been freed while still being journaled
+ * by a previous transaction, refile the buffer to BJ_Forget of
+ * the running transaction. If the just committed transaction
+ * contains "add to orphan" operation, we can completely
+ * invalidate the buffer now. We are rather through in that
+ * since the buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial page.
+ */
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
+ struct address_space *mapping;
+
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+
/*
- * If the running transaction is the one containing
- * "add to orphan" operation (b_next_transaction !=
- * NULL), we have to wait for that transaction to
- * commit before we can really get rid of the buffer.
- * So just clear b_modified to not confuse transaction
- * credit accounting and refile the buffer to
- * BJ_Forget of the running transaction. If the just
- * committed transaction contains "add to orphan"
- * operation, we can completely invalidate the buffer
- * now. We are rather through in that since the
- * buffer may be still accessible when blocksize <
- * pagesize and it is attached to the last partial
- * page.
+ * Block device buffers need to stay mapped all the
+ * time, so it is enough to clear buffer_jbddirty and
+ * buffer_freed bits. For the file mapping buffers (i.e.
+ * journalled data) we need to unmap buffer and clear
+ * more bits. We also need to be careful about the check
+ * because the data page mapping can get cleared under
+ * our hands. Note that if mapping == NULL, we don't
+ * need to make buffer unmapped because the page is
+ * already detached from the mapping and buffers cannot
+ * get reused.
*/
- jh->b_modified = 0;
- if (!jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ mapping = READ_ONCE(bh->b_folio->mapping);
+ if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_req(bh);
@@ -1008,8 +1031,10 @@ restart_loop:
try_to_free = 1;
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop_ref = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
+ if (drop_ref)
+ jbd2_journal_put_journal_head(jh);
if (try_to_free)
release_buffer_page(bh); /* Drops bh reference */
else
@@ -1056,7 +1081,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD2: commit phase 7\n");
+ jbd2_debug(3, "JBD2: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
@@ -1076,7 +1101,7 @@ restart_loop:
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
- journal->j_commit_sequence = commit_transaction->t_tid;
+ WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
journal->j_committing_transaction = NULL;
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
@@ -1094,23 +1119,27 @@ restart_loop:
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
trace_jbd2_end_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+ jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Check if the transaction can be dropped now that we are finished */
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
+ if (commit_transaction->t_checkpoint_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
+ wake_up(&journal->j_fc_wait);
/*
* Calculate overall stats
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 7d5ef3bf3f3e..c973162d5b31 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/journal.c
*
@@ -5,10 +6,6 @@
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Generic filesystem journal-writing code; part of the ext2fs
* journaling system.
*
@@ -52,8 +49,7 @@
#include <asm/page.h>
#ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -69,9 +65,6 @@ EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
-#if 0
-EXPORT_SYMBOL(journal_sync_buffer);
-#endif
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);
@@ -87,22 +80,21 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
-EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
-EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_folio);
+EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_inode_add_write);
-EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
+EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
#ifdef CONFIG_JBD2_DEBUG
@@ -117,59 +109,34 @@ void __jbd2_debug(int level, const char *file, const char *func,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
-EXPORT_SYMBOL(__jbd2_debug);
#endif
/* Checksumming functions */
-static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3_feature(j))
- return 1;
-
- return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
-}
-
-static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+static __be32 jbd2_superblock_csum(journal_superblock_t *sb)
{
__u32 csum;
__be32 old_csum;
old_csum = sb->s_checksum;
sb->s_checksum = 0;
- csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+ csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t));
sb->s_checksum = old_csum;
return cpu_to_be32(csum);
}
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
/*
* Helper function used to manage commit timeouts
*/
-static void commit_timeout(unsigned long __data)
+static void commit_timeout(struct timer_list *t)
{
- struct task_struct * p = (struct task_struct *) __data;
+ journal_t *journal = timer_container_of(journal, t, j_commit_timer);
- wake_up_process(p);
+ wake_up_process(journal->j_task);
}
/*
@@ -180,7 +147,9 @@ static void commit_timeout(unsigned long __data)
*
* 1) COMMIT: Every so often we need to commit the current state of the
* filesystem to disk. The journal thread is responsible for writing
- * all of the metadata buffers to disk.
+ * all of the metadata buffers to disk. If a fast commit is ongoing
+ * journal thread waits until it's done and then continues from
+ * there on.
*
* 2) CHECKPOINT: We cannot reuse a used section of the log file until all
* of the data in that part of the log has been rewritten elsewhere on
@@ -197,8 +166,7 @@ static int kjournald2(void *arg)
* Set up an interval timer which can be used to trigger a commit wakeup
* after the commit interval expires
*/
- setup_timer(&journal->j_commit_timer, commit_timeout,
- (unsigned long)current);
+ timer_setup(&journal->j_commit_timer, commit_timeout, 0);
set_freezable();
@@ -223,13 +191,13 @@ loop:
if (journal->j_flags & JBD2_UNMOUNT)
goto end_loop;
- jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+ jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
journal->j_commit_sequence, journal->j_commit_request);
if (journal->j_commit_sequence != journal->j_commit_request) {
- jbd_debug(1, "OK, requests differ\n");
+ jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
write_lock(&journal->j_state_lock);
goto loop;
@@ -242,7 +210,7 @@ loop:
* good idea, because that depends on threads that may
* be already stopped.
*/
- jbd_debug(1, "Now suspending kjournald2\n");
+ jbd2_debug(1, "Now suspending kjournald2\n");
write_unlock(&journal->j_state_lock);
try_to_freeze();
write_lock(&journal->j_state_lock);
@@ -252,19 +220,12 @@ loop:
* so we don't sleep
*/
DEFINE_WAIT(wait);
- int should_sleep = 1;
prepare_to_wait(&journal->j_wait_commit, &wait,
TASK_INTERRUPTIBLE);
- if (journal->j_commit_sequence != journal->j_commit_request)
- should_sleep = 0;
transaction = journal->j_running_transaction;
- if (transaction && time_after_eq(jiffies,
- transaction->t_expires))
- should_sleep = 0;
- if (journal->j_flags & JBD2_UNMOUNT)
- should_sleep = 0;
- if (should_sleep) {
+ if (transaction == NULL ||
+ time_before(jiffies, transaction->t_expires)) {
write_unlock(&journal->j_state_lock);
schedule();
write_lock(&journal->j_state_lock);
@@ -272,7 +233,7 @@ loop:
finish_wait(&journal->j_wait_commit, &wait);
}
- jbd_debug(1, "kjournald2 wakes\n");
+ jbd2_debug(1, "kjournald2 wakes\n");
/*
* Were we woken up by a commit wakeup event?
@@ -280,15 +241,15 @@ loop:
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
journal->j_commit_request = transaction->t_tid;
- jbd_debug(1, "woke because of timeout\n");
+ jbd2_debug(1, "woke because of timeout\n");
}
goto loop;
end_loop:
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
- jbd_debug(1, "Journal thread exiting.\n");
+ jbd2_debug(1, "Journal thread exiting.\n");
write_unlock(&journal->j_state_lock);
return 0;
}
@@ -320,6 +281,16 @@ static void journal_kill_thread(journal_t *journal)
write_unlock(&journal->j_state_lock);
}
+static inline bool jbd2_data_needs_escaping(char *data)
+{
+ return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER);
+}
+
+static inline void jbd2_data_do_escape(char *data)
+{
+ *((unsigned int *)data) = 0;
+}
+
/*
* jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
*
@@ -344,15 +315,11 @@ static void journal_kill_thread(journal_t *journal)
* IO is in progress. do_get_write_access() handles this.
*
* The function returns a pointer to the buffer_head to be used for IO.
- *
*
- * Return value:
- * <0: Error
- * >=0: Finished OK
*
- * On success:
- * Bit 0 set == escape performed on the data
- * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ * Return value:
+ * =0: Finished OK without escape
+ * =1: Finished OK with escape
*/
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
@@ -360,12 +327,9 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct buffer_head **bh_out,
sector_t blocknr)
{
- int need_copy_out = 0;
- int done_copy_out = 0;
int do_escape = 0;
- char *mapped_data;
struct buffer_head *new_bh;
- struct page *new_page;
+ struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
@@ -386,88 +350,65 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- jbd_lock_bh_state(bh_in);
-repeat:
+ spin_lock(&jh_in->b_state_lock);
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
if (jh_in->b_frozen_data) {
- done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
+ do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data);
+ if (do_escape)
+ jbd2_data_do_escape(jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
- }
+ char *tmp;
+ char *mapped_data;
- mapped_data = kmap_atomic(new_page);
- /*
- * Fire data frozen trigger if data already wasn't frozen. Do this
- * before checking for escaping, as the trigger may modify the magic
- * offset. If a copy-out happens afterwards, it will have the correct
- * data in the buffer.
- */
- if (!done_copy_out)
- jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ new_folio = bh_in->b_folio;
+ new_offset = offset_in_folio(new_folio, bh_in->b_data);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ /*
+ * Fire data frozen trigger if data already wasn't frozen. Do
+ * this before checking for escaping, as the trigger may modify
+ * the magic offset. If a copy-out happens afterwards, it will
+ * have the correct data in the buffer.
+ */
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
+ do_escape = jbd2_data_needs_escaping(mapped_data);
+ kunmap_local(mapped_data);
+ /*
+ * Do we need to do a data copy?
+ */
+ if (!do_escape)
+ goto escape_done;
- /*
- * Check for escaping
- */
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- need_copy_out = 1;
- do_escape = 1;
- }
- kunmap_atomic(mapped_data);
-
- /*
- * Do we need to do a data copy?
- */
- if (need_copy_out && !done_copy_out) {
- char *tmp;
-
- jbd_unlock_bh_state(bh_in);
- tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
- if (!tmp) {
- brelse(new_bh);
- return -ENOMEM;
- }
- jbd_lock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
+ tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
+ spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
- goto repeat;
+ goto copy_done;
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
- kunmap_atomic(mapped_data);
-
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
- done_copy_out = 1;
-
+ memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
/*
* This isn't strictly necessary, as we're using frozen
* data for the escaping, but it keeps consistency with
* b_frozen_data usage.
*/
jh_in->b_frozen_triggers = jh_in->b_triggers;
- }
- /*
- * Did we need to do an escaping? Now we've done all the
- * copying, we can finally do so.
- */
- if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
+copy_done:
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
+ jbd2_data_do_escape(jh_in->b_frozen_data);
}
- set_bh_page(new_bh, new_page, new_offset);
+escape_done:
+ folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
@@ -487,9 +428,9 @@ repeat:
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
- return do_escape | (done_copy_out << 1);
+ return do_escape;
}
/*
@@ -501,7 +442,7 @@ repeat:
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/* Return if the txn has already requested to be committed */
if (journal->j_commit_request == target)
@@ -520,7 +461,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
*/
journal->j_commit_request = target;
- jbd_debug(1, "JBD2: requesting commit %d/%d\n",
+ jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
journal->j_commit_request,
journal->j_commit_sequence);
journal->j_running_transaction->t_requested = jiffies;
@@ -533,7 +474,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
journal->j_commit_request,
journal->j_commit_sequence,
- target, journal->j_running_transaction ?
+ target, journal->j_running_transaction ?
journal->j_running_transaction->t_tid : 0);
return 0;
}
@@ -586,12 +527,14 @@ static int __jbd2_journal_force_commit(journal_t *journal)
}
/**
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
+ * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
+ * calling process is not within transaction.
*
* @journal: journal to force
* Returns true if progress was made.
+ *
+ * This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
*/
int jbd2_journal_force_commit_nested(journal_t *journal)
{
@@ -602,7 +545,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
}
/**
- * int journal_force_commit() - force any uncommitted transactions
+ * jbd2_journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* Caller want unconditional commit. We can only force the running transaction
@@ -660,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
int ret = 0;
- transaction_t *commit_trans;
+ transaction_t *commit_trans, *running_trans;
if (!(journal->j_flags & JBD2_BARRIER))
return 0;
@@ -670,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
goto out;
commit_trans = journal->j_committing_transaction;
if (!commit_trans || commit_trans->t_tid != tid) {
+ running_trans = journal->j_running_transaction;
+ /*
+ * The query transaction hasn't started committing,
+ * it must still be running.
+ */
+ if (WARN_ON_ONCE(!running_trans ||
+ running_trans->t_tid != tid))
+ goto out;
+
+ running_trans->t_need_data_flush = 1;
ret = 1;
goto out;
}
@@ -718,12 +671,12 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
printk(KERN_ERR
- "%s: error: j_commit_request=%d, tid=%d\n",
+ "%s: error: j_commit_request=%u, tid=%u\n",
__func__, journal->j_commit_request, tid);
}
#endif
while (tid_gt(tid, journal->j_commit_sequence)) {
- jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
+ jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
tid, journal->j_commit_sequence);
read_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_commit);
@@ -739,6 +692,92 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
/*
+ * Start a fast commit. If there's an ongoing fast or full commit wait for
+ * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
+ * if a fast commit is not needed, either because there's an already a commit
+ * going on or this tid has already been committed. Returns -EINVAL if no jbd2
+ * commit has yet been performed.
+ */
+int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
+{
+ if (unlikely(is_journal_aborted(journal)))
+ return -EIO;
+ /*
+ * Fast commits only allowed if at least one full commit has
+ * been processed.
+ */
+ if (!journal->j_stats.ts_tid)
+ return -EINVAL;
+
+ write_lock(&journal->j_state_lock);
+ if (tid_geq(journal->j_commit_sequence, tid)) {
+ write_unlock(&journal->j_state_lock);
+ return -EALREADY;
+ }
+
+ if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_fc_wait, &wait);
+ return -EALREADY;
+ }
+ journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_begin_commit);
+
+/*
+ * Stop a fast commit. If fallback is set, this function starts commit of
+ * TID tid before any other fast commit can start.
+ */
+static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
+{
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 0, tid);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
+ if (fallback)
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_fc_wait);
+ if (fallback)
+ return jbd2_complete_transaction(journal, tid);
+ return 0;
+}
+
+int jbd2_fc_end_commit(journal_t *journal)
+{
+ return __jbd2_fc_end_commit(journal, 0, false);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit);
+
+int jbd2_fc_end_commit_fallback(journal_t *journal)
+{
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ tid = journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0;
+ read_unlock(&journal->j_state_lock);
+ return __jbd2_fc_end_commit(journal, tid, true);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
+
+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+ return tid_geq(READ_ONCE(journal->j_commit_sequence), tid);
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
+/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
* committing that transaction before waiting for it to complete. If
@@ -789,6 +828,90 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
return jbd2_journal_bmap(journal, blocknr, retp);
}
+/* Map one fast commit buffer for use by the file system */
+int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
+{
+ unsigned long long pblock;
+ unsigned long blocknr;
+ int ret = 0;
+ struct buffer_head *bh;
+ int fc_off;
+
+ *bh_out = NULL;
+
+ if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last)
+ return -EINVAL;
+
+ fc_off = journal->j_fc_off;
+ blocknr = journal->j_fc_first + fc_off;
+ journal->j_fc_off++;
+ ret = jbd2_journal_bmap(journal, blocknr, &pblock);
+ if (ret)
+ return ret;
+
+ bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
+ if (!bh)
+ return -ENOMEM;
+
+ journal->j_fc_wbuf[fc_off] = bh;
+
+ *bh_out = bh;
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_get_buf);
+
+/*
+ * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
+ * for completion.
+ */
+int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ j_fc_off = journal->j_fc_off;
+
+ /*
+ * Wait in reverse order to minimize chances of us being woken up before
+ * all IOs have completed
+ */
+ for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
+ bh = journal->j_fc_wbuf[i];
+ wait_on_buffer(bh);
+ /*
+ * Update j_fc_off so jbd2_fc_release_bufs can release remain
+ * buffer head.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ journal->j_fc_off = i + 1;
+ return -EIO;
+ }
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_wait_bufs);
+
+void jbd2_fc_release_bufs(journal_t *journal)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ j_fc_off = journal->j_fc_off;
+
+ for (i = j_fc_off - 1; i >= 0; i--) {
+ bh = journal->j_fc_wbuf[i];
+ if (!bh)
+ break;
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ }
+}
+EXPORT_SYMBOL(jbd2_fc_release_bufs);
+
/*
* Conversion of logical to physical block numbers for the journal
*
@@ -801,18 +924,25 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
{
int err = 0;
unsigned long long ret;
+ sector_t block = blocknr;
- if (journal->j_inode) {
- ret = bmap(journal->j_inode, blocknr);
- if (ret)
- *retp = ret;
- else {
+ if (journal->j_bmap) {
+ err = journal->j_bmap(journal, &block);
+ if (err == 0)
+ *retp = block;
+ } else if (journal->j_inode) {
+ ret = bmap(journal->j_inode, &block);
+
+ if (ret || !block) {
printk(KERN_ALERT "%s: journal block not found "
"at offset %lu on %s\n",
__func__, blocknr, journal->j_devname);
+ jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED);
err = -EIO;
- __journal_abort_soft(journal, err);
+ } else {
+ *retp = block;
}
+
} else {
*retp = blocknr; /* +journal->j_blk_offset */
}
@@ -825,7 +955,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* descriptor blocks we do need to generate bona fide buffers.
*
* After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
- * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * the buffer's contents they really should run flush_dcache_folio(bh->b_folio).
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
@@ -846,6 +976,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return NULL;
+ atomic_dec(&transaction->t_outstanding_credits);
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
header = (journal_header_t *)bh->b_data;
@@ -869,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -935,8 +1066,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
- REQ_SYNC | REQ_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
if (ret)
goto out;
@@ -946,8 +1076,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
freed += journal->j_last - journal->j_first;
trace_jbd2_update_log_tail(journal, tid, block, freed);
- jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
+ jbd2_debug(1,
+ "Cleaning journal tail from %u to %u (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, tid, block, freed);
@@ -961,7 +1091,7 @@ out:
}
/*
- * This is a variaon of __jbd2_update_log_tail which checks for validity of
+ * This is a variation of __jbd2_update_log_tail which checks for validity of
* provided log tail and locks j_checkpoint_mutex. So it is safe against races
* with other threads updating log tail.
*/
@@ -987,6 +1117,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ (*pos)++;
return NULL;
}
@@ -1040,7 +1171,7 @@ static const struct seq_operations jbd2_seq_info_ops = {
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
- journal_t *journal = PDE_DATA(inode);
+ journal_t *journal = pde_data(inode);
struct jbd2_stats_proc_session *s;
int rc, size;
@@ -1079,12 +1210,11 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations jbd2_seq_info_fops = {
- .owner = THIS_MODULE,
- .open = jbd2_seq_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = jbd2_seq_info_release,
+static const struct proc_ops jbd2_info_proc_ops = {
+ .proc_open = jbd2_seq_info_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = jbd2_seq_info_release,
};
static struct proc_dir_entry *proc_jbd2_stats;
@@ -1094,7 +1224,7 @@ static void jbd2_stats_proc_init(journal_t *journal)
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
- &jbd2_seq_info_fops, journal);
+ &jbd2_info_proc_ops, journal);
}
}
@@ -1104,44 +1234,333 @@ static void jbd2_stats_proc_exit(journal_t *journal)
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
+/* Minimum size of descriptor tag */
+static int jbd2_min_tag_size(void)
+{
+ /*
+ * Tag with 32-bit block numbers does not use last four bytes of the
+ * structure
+ */
+ return sizeof(journal_block_tag_t) - 4;
+}
+
+/**
+ * jbd2_journal_shrink_scan()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
+ *
+ * Scan the checkpointed buffer on the checkpoint list and release the
+ * journal_head.
+ */
+static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ journal_t *journal = shrink->private_data;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+ unsigned long nr_shrunk;
+ unsigned long count;
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
+
+ nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
+
+ return nr_shrunk;
+}
+
+/**
+ * jbd2_journal_shrink_count()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
+ *
+ * Count the number of checkpoint buffers on the checkpoint list.
+ */
+static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ journal_t *journal = shrink->private_data;
+ unsigned long count;
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
+
+ return count;
+}
+
+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock(journal_t *journal)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Check the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+static int journal_check_superblock(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ int num_fc_blks;
+ int err = -EINVAL;
+
+ if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+ sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+ printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
+ be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
+ printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
+ printk(KERN_WARNING "JBD2: journal file too short\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_first) == 0 ||
+ be32_to_cpu(sb->s_first) >= journal->j_total_len) {
+ printk(KERN_WARNING
+ "JBD2: Invalid start block of journal: %u\n",
+ be32_to_cpu(sb->s_first));
+ return err;
+ }
+
+ /*
+ * If this is a V2 superblock, then we have to check the
+ * features flags on it.
+ */
+ if (!jbd2_format_support_feature(journal))
+ return 0;
+
+ if ((sb->s_feature_ro_compat &
+ ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+ (sb->s_feature_incompat &
+ ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+ printk(KERN_WARNING "JBD2: Unrecognised features on journal\n");
+ return err;
+ }
+
+ num_fc_blks = jbd2_has_feature_fast_commit(journal) ?
+ jbd2_journal_get_num_fc_blks(sb) : 0;
+ if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS ||
+ be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) {
+ printk(KERN_ERR "JBD2: journal file too short %u,%d\n",
+ be32_to_cpu(sb->s_maxlen), num_fc_blks);
+ return err;
+ }
+
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
+ /* Can't have checksum v2 and v3 at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ if (jbd2_journal_has_csum_v2or3(journal) &&
+ jbd2_has_feature_checksum(journal)) {
+ /* Can't have checksum v1 and v2 on at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ if (jbd2_journal_has_csum_v2or3(journal)) {
+ if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
+ printk(KERN_ERR "JBD2: Unknown checksum type\n");
+ return err;
+ }
+
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+}
+
+static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+ return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
+}
+
+/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
+ tags_per_block);
+}
+
+/*
+ * Initialize number of blocks each transaction reserves for its bookkeeping
+ * and maximum number of blocks a transaction can use. This needs to be called
+ * after the journal size and the fastcommit area size are initialized.
+ */
+static void jbd2_journal_init_transaction_limits(journal_t *journal)
+{
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+ journal->j_transaction_overhead_buffers =
+ jbd2_descriptor_blocks_per_trans(journal);
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+static int journal_load_superblock(journal_t *journal)
+{
+ int err;
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+
+ bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset,
+ journal->j_blocksize);
+ if (bh)
+ err = bh_read(bh, 0);
+ if (!bh || err < 0) {
+ pr_err("%s: Cannot read journal superblock\n", __func__);
+ brelse(bh);
+ return -EIO;
+ }
+
+ journal->j_sb_buffer = bh;
+ sb = (journal_superblock_t *)bh->b_data;
+ journal->j_superblock = sb;
+ err = journal_check_superblock(journal);
+ if (err) {
+ journal_fail_superblock(journal);
+ return err;
+ }
+
+ journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+ journal->j_tail = be32_to_cpu(sb->s_start);
+ journal->j_first = be32_to_cpu(sb->s_first);
+ journal->j_errno = be32_to_cpu(sb->s_errno);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
+ journal->j_total_len = be32_to_cpu(sb->s_maxlen);
+ /* Precompute checksum seed for all metadata */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ /* After journal features are set, we can compute transaction limits */
+ jbd2_journal_init_transaction_limits(journal);
+
+ if (jbd2_has_feature_fast_commit(journal)) {
+ journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
+ journal->j_last = journal->j_fc_last -
+ jbd2_journal_get_num_fc_blks(sb);
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ }
+
+ return 0;
+}
+
+
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
-/* First: create and setup a journal_t object in memory. We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
+/* The journal_init_common() function creates and fills a journal_t object
+ * in memory. It calls journal_load_superblock() to load the on-disk journal
+ * superblock and initialize the journal_t object.
+ */
static journal_t *journal_init_common(struct block_device *bdev,
struct block_device *fs_dev,
unsigned long long start, int len, int blocksize)
{
- static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
- struct buffer_head *bh;
int n;
journal = kzalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ lockdep_register_key(&journal->jbd2_trans_commit_key);
+ journal->j_blocksize = blocksize;
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_total_len = len;
+ jbd2_init_fs_dev_write_error(journal);
+
+ err = journal_load_superblock(journal);
+ if (err)
+ goto err_cleanup;
init_waitqueue_head(&journal->j_wait_transaction_locked);
init_waitqueue_head(&journal->j_wait_done_commit);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
+ init_waitqueue_head(&journal->j_fc_wait);
+ mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
+ spin_lock_init(&journal->j_history_lock);
rwlock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
atomic_set(&journal->j_reserved_credits, 0);
+ lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
+ &journal->jbd2_trans_commit_key, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1151,40 +1570,50 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (err)
goto err_cleanup;
- spin_lock_init(&journal->j_history_lock);
-
- lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
- &jbd2_trans_commit_key, 0);
-
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_maxlen = len;
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ /*
+ * journal descriptor can store up to n blocks, we need enough
+ * buffers to write out full descriptor block.
+ */
+ err = -ENOMEM;
+ n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
+ journal->j_fc_wbuf = NULL;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!journal->j_wbuf)
goto err_cleanup;
- bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
- pr_err("%s: Cannot get buffer for journal superblock\n",
- __func__);
+ err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0,
+ GFP_KERNEL);
+ if (err)
+ goto err_cleanup;
+
+ journal->j_shrink_transaction = NULL;
+
+ journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ if (!journal->j_shrinker) {
+ err = -ENOMEM;
goto err_cleanup;
}
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+ journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
+ journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
+ journal->j_shrinker->private_data = journal;
+
+ shrinker_register(journal->j_shrinker);
return journal;
err_cleanup:
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
+ journal_fail_superblock(journal);
+ lockdep_unregister_key(&journal->jbd2_trans_commit_key);
kfree(journal);
- return NULL;
+ return ERR_PTR(err);
}
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1217,10 +1646,11 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
journal_t *journal;
journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
- bdevname(journal->j_dev, journal->j_devname);
+ snprintf(journal->j_devname, sizeof(journal->j_devname),
+ "%pg", journal->j_dev);
strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
@@ -1238,48 +1668,36 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
journal_t *journal;
- char *p;
- unsigned long long blocknr;
+ sector_t blocknr;
+ int err = 0;
- blocknr = bmap(inode, 0);
- if (!blocknr) {
- pr_err("%s: Cannot locate journal superblock\n",
- __func__);
- return NULL;
+ blocknr = 0;
+ err = bmap(inode, &blocknr);
+ if (err || !blocknr) {
+ pr_err("%s: Cannot locate journal superblock\n", __func__);
+ return err ? ERR_PTR(err) : ERR_PTR(-EINVAL);
}
- jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+ jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
inode->i_sb->s_blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
journal->j_inode = inode;
- bdevname(journal->j_dev, journal->j_devname);
- p = strreplace(journal->j_devname, '/', '!');
- sprintf(p, "-%lu", journal->j_inode->i_ino);
+ snprintf(journal->j_devname, sizeof(journal->j_devname),
+ "%pg-%lu", journal->j_dev, journal->j_inode->i_ino);
+ strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
return journal;
}
/*
- * If the journal init or create aborts, we need to mark the journal
- * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock.
- */
-static void journal_fail_superblock (journal_t *journal)
-{
- struct buffer_head *bh = journal->j_sb_buffer;
- brelse(bh);
- journal->j_sb_buffer = NULL;
-}
-
-/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
* a journal, and after recovering an old journal to reset it for
@@ -1303,15 +1721,33 @@ static int journal_reset(journal_t *journal)
journal->j_first = first;
journal->j_last = last;
- journal->j_head = first;
- journal->j_tail = first;
- journal->j_free = last - first;
+ if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) {
+ /*
+ * Disable the cycled recording mode if the journal head block
+ * number is not correct.
+ */
+ if (journal->j_head < first || journal->j_head >= last) {
+ printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, "
+ "disable journal_cycle_record\n",
+ journal->j_head);
+ journal->j_head = journal->j_first;
+ }
+ } else {
+ journal->j_head = journal->j_first;
+ }
+ journal->j_tail = journal->j_head;
+ journal->j_free = journal->j_last - journal->j_first;
journal->j_tail_sequence = journal->j_transaction_sequence;
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+ /*
+ * Now that journal recovery is done, turn fast commits off here. This
+ * way, if fast commit was enabled before the crash but if now FS has
+ * disabled it, we don't enable fast commits.
+ */
+ jbd2_clear_feature_fast_commit(journal);
/*
* As a special case, if the on-disk copy is already marked as needing
@@ -1320,8 +1756,8 @@ static int journal_reset(journal_t *journal)
* attempting a write to a potential-readonly device.
*/
if (sb->s_start == 0) {
- jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
- "(start %ld, seq %d, errno %d)\n",
+ jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
+ "(start %ld, seq %u, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
journal->j_flags |= JBD2_FLUSHED;
@@ -1336,23 +1772,38 @@ static int journal_reset(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC | REQ_FUA);
+ journal->j_tail, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
}
-static int jbd2_write_superblock(journal_t *journal, int write_flags)
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
+static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
- int ret;
+ int ret = 0;
- trace_jbd2_write_superblock(journal, write_flags);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+
+ /*
+ * Always set high priority flags to exempt from block layer's
+ * QOS policies, e.g. writeback throttle.
+ */
+ write_flags |= JBD2_JOURNAL_REQ_FLAGS;
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
- lock_buffer(bh);
+
+ trace_jbd2_write_superblock(journal, write_flags);
+
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1368,10 +1819,11 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
- jbd2_superblock_csum_set(journal, sb);
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
@@ -1379,10 +1831,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
ret = -EIO;
}
if (ret) {
- printk(KERN_ERR "JBD2: Error %d detected when updating "
- "journal superblock for %s.\n", ret,
- journal->j_devname);
- jbd2_journal_abort(journal, ret);
+ printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
+ journal->j_devname);
+ if (!is_journal_aborted(journal))
+ jbd2_journal_abort(journal, ret);
}
return ret;
@@ -1393,31 +1845,40 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
* @journal: The journal to update.
* @tail_tid: TID of the new transaction at the tail of the log
* @tail_block: The first block of the transaction at the tail of the log
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
*
* Update a journal's superblock information about log tail and write it to
* disk, waiting for the IO to complete.
*/
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
- unsigned long tail_block, int write_op)
+ unsigned long tail_block,
+ blk_opf_t write_flags)
{
journal_superblock_t *sb = journal->j_superblock;
int ret;
+ if (is_journal_aborted(journal))
+ return -EIO;
+ ret = jbd2_check_fs_dev_write_error(journal);
+ if (ret) {
+ jbd2_journal_abort(journal, ret);
+ return -EIO;
+ }
+
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+ jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
+ lock_buffer(journal->j_sb_buffer);
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
- ret = jbd2_write_superblock(journal, write_op);
+ ret = jbd2_write_superblock(journal, write_flags);
if (ret)
goto out;
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
- WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
@@ -1428,203 +1889,173 @@ out:
/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
-static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
+static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
{
journal_superblock_t *sb = journal->j_superblock;
+ bool had_fast_commit = false;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- read_lock(&journal->j_state_lock);
- /* Is it already empty? */
- if (sb->s_start == 0) {
- read_unlock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
+ if (sb->s_start == 0) { /* Is it already empty? */
+ unlock_buffer(journal->j_sb_buffer);
return;
}
- jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+
+ jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
- read_unlock(&journal->j_state_lock);
+ sb->s_head = cpu_to_be32(journal->j_head);
+ if (jbd2_has_feature_fast_commit(journal)) {
+ /*
+ * When journal is clean, no need to commit fast commit flag and
+ * make file system incompatible with older kernels.
+ */
+ jbd2_clear_feature_fast_commit(journal);
+ had_fast_commit = true;
+ }
- jbd2_write_superblock(journal, write_op);
+ jbd2_write_superblock(journal, write_flags);
- /* Log is no longer empty */
+ if (had_fast_commit)
+ jbd2_set_feature_fast_commit(journal);
+
+ /* Log is empty */
write_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
}
-
/**
- * jbd2_journal_update_sb_errno() - Update error in the journal.
- * @journal: The journal to update.
- *
- * Update a journal's errno. Write updated superblock to disk waiting for IO
- * to complete.
- */
-void jbd2_journal_update_sb_errno(journal_t *journal)
-{
- journal_superblock_t *sb = journal->j_superblock;
-
- read_lock(&journal->j_state_lock);
- jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
- journal->j_errno);
- sb->s_errno = cpu_to_be32(journal->j_errno);
- read_unlock(&journal->j_state_lock);
-
- jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
-}
-EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
-
-/*
- * Read the superblock for a given journal, performing initial
- * validation of the format.
+ * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
+ * @journal: The journal to erase.
+ * @flags: A discard/zeroout request is sent for each physically contigous
+ * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
+ * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
+ * to perform.
+ *
+ * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
+ * will be explicitly written if no hardware offload is available, see
+ * blkdev_issue_zeroout for more details.
*/
-static int journal_get_superblock(journal_t *journal)
+static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
{
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int err = -EIO;
+ int err = 0;
+ unsigned long block, log_offset; /* logical */
+ unsigned long long phys_block, block_start, block_stop; /* physical */
+ loff_t byte_start, byte_stop, byte_count;
+
+ /* flags must be set to either discard or zeroout */
+ if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
+ ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+ return -EINVAL;
- bh = journal->j_sb_buffer;
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(journal->j_dev))
+ return -EOPNOTSUPP;
- J_ASSERT(bh != NULL);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
+ /*
+ * lookup block mapping and issue discard/zeroout for each
+ * contiguous region
+ */
+ log_offset = be32_to_cpu(journal->j_superblock->s_first);
+ block_start = ~0ULL;
+ for (block = log_offset; block < journal->j_total_len; block++) {
+ err = jbd2_journal_bmap(journal, block, &phys_block);
+ if (err) {
+ pr_err("JBD2: bad block at offset %lu", block);
+ return err;
}
- }
-
- if (buffer_verified(bh))
- return 0;
-
- sb = journal->j_superblock;
-
- err = -EINVAL;
-
- if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
- sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
- printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
- goto out;
- }
- switch(be32_to_cpu(sb->s_header.h_blocktype)) {
- case JBD2_SUPERBLOCK_V1:
- journal->j_format_version = 1;
- break;
- case JBD2_SUPERBLOCK_V2:
- journal->j_format_version = 2;
- break;
- default:
- printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
- journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
- else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
- printk(KERN_WARNING "JBD2: journal file too short\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
- printk(KERN_WARNING
- "JBD2: Invalid start block of journal: %u\n",
- be32_to_cpu(sb->s_first));
- goto out;
- }
-
- if (jbd2_has_feature_csum2(journal) &&
- jbd2_has_feature_csum3(journal)) {
- /* Can't have checksum v2 and v3 at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
- "at the same time!\n");
- goto out;
- }
+ if (block_start == ~0ULL)
+ block_stop = block_start = phys_block;
- if (jbd2_journal_has_csum_v2or3_feature(journal) &&
- jbd2_has_feature_checksum(journal)) {
- /* Can't have checksum v1 and v2 on at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
- "at the same time!\n");
- goto out;
- }
+ /*
+ * last block not contiguous with current block,
+ * process last contiguous region and return to this block on
+ * next loop
+ */
+ if (phys_block != block_stop) {
+ block--;
+ } else {
+ block_stop++;
+ /*
+ * if this isn't the last block of journal,
+ * no need to process now because next block may also
+ * be part of this contiguous region
+ */
+ if (block != journal->j_total_len - 1)
+ continue;
+ }
- if (!jbd2_verify_csum_type(journal, sb)) {
- printk(KERN_ERR "JBD2: Unknown checksum type\n");
- goto out;
- }
+ /*
+ * end of contiguous region or this is last block of journal,
+ * take care of the region
+ */
+ byte_start = block_start * journal->j_blocksize;
+ byte_stop = block_stop * journal->j_blocksize;
+ byte_count = (block_stop - block_start) * journal->j_blocksize;
+
+ truncate_inode_pages_range(journal->j_dev->bd_mapping,
+ byte_start, byte_stop - 1);
+
+ if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
+ err = blkdev_issue_discard(journal->j_dev,
+ byte_start >> SECTOR_SHIFT,
+ byte_count >> SECTOR_SHIFT,
+ GFP_NOFS);
+ } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
+ err = blkdev_issue_zeroout(journal->j_dev,
+ byte_start >> SECTOR_SHIFT,
+ byte_count >> SECTOR_SHIFT,
+ GFP_NOFS, 0);
+ }
- /* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3_feature(journal)) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
- err = PTR_ERR(journal->j_chksum_driver);
- journal->j_chksum_driver = NULL;
- goto out;
+ if (unlikely(err != 0)) {
+ pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)",
+ err, block_start, block_stop);
+ return err;
}
- }
- /* Check superblock checksum */
- if (!jbd2_superblock_csum_verify(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
+ /* reset start and stop after processing a region */
+ block_start = ~0ULL;
}
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
- sizeof(sb->s_uuid));
-
- set_buffer_verified(bh);
-
- return 0;
-
-out:
- journal_fail_superblock(journal);
- return err;
+ return blkdev_issue_flush(journal->j_dev);
}
-/*
- * Load the on-disk journal superblock and read the key fields into the
- * journal_t.
+/**
+ * jbd2_journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno. Write updated superblock to disk waiting for IO
+ * to complete.
*/
-
-static int load_superblock(journal_t *journal)
+void jbd2_journal_update_sb_errno(journal_t *journal)
{
- int err;
- journal_superblock_t *sb;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
+ journal_superblock_t *sb = journal->j_superblock;
+ int errcode;
- sb = journal->j_superblock;
+ lock_buffer(journal->j_sb_buffer);
+ errcode = journal->j_errno;
+ if (errcode == -ESHUTDOWN)
+ errcode = 0;
+ jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+ sb->s_errno = cpu_to_be32(errcode);
- journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
- journal->j_tail = be32_to_cpu(sb->s_start);
- journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
- journal->j_errno = be32_to_cpu(sb->s_errno);
-
- return 0;
+ jbd2_write_superblock(journal, REQ_FUA);
}
-
+EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
/**
- * int jbd2_journal_load() - Read journal from disk.
+ * jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
@@ -1634,26 +2065,7 @@ static int load_superblock(journal_t *journal)
int jbd2_journal_load(journal_t *journal)
{
int err;
- journal_superblock_t *sb;
-
- err = load_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
- /* If this is a V2 superblock, then we have to check the
- * features flags on it. */
-
- if (journal->j_format_version >= 2) {
- if ((sb->s_feature_ro_compat &
- ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
- (sb->s_feature_incompat &
- ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
- printk(KERN_WARNING
- "JBD2: Unrecognised features on journal\n");
- return -EINVAL;
- }
- }
+ journal_superblock_t *sb = journal->j_superblock;
/*
* Create a slab for this blocksize
@@ -1664,8 +2076,11 @@ int jbd2_journal_load(journal_t *journal)
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
- if (jbd2_journal_recover(journal))
- goto recovery_error;
+ err = jbd2_journal_recover(journal);
+ if (err) {
+ pr_warn("JBD2: journal recovery failed\n");
+ return err;
+ }
if (journal->j_failed_commit) {
printk(KERN_ERR "JBD2: journal transaction %u on %s "
@@ -1673,24 +2088,27 @@ int jbd2_journal_load(journal_t *journal)
journal->j_devname);
return -EFSCORRUPTED;
}
+ /*
+ * clear JBD2_ABORT flag initialized in journal_init_common
+ * here to update log tail information with the newest seq.
+ */
+ journal->j_flags &= ~JBD2_ABORT;
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
- if (journal_reset(journal))
- goto recovery_error;
+ err = journal_reset(journal);
+ if (err) {
+ pr_warn("JBD2: journal reset failed\n");
+ return err;
+ }
- journal->j_flags &= ~JBD2_ABORT;
journal->j_flags |= JBD2_LOADED;
return 0;
-
-recovery_error:
- printk(KERN_WARNING "JBD2: recovery failed\n");
- return -EIO;
}
/**
- * void jbd2_journal_destroy() - Release a journal_t structure.
+ * jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
@@ -1734,6 +2152,18 @@ int jbd2_journal_destroy(journal_t *journal)
J_ASSERT(journal->j_checkpoint_transactions == NULL);
spin_unlock(&journal->j_list_lock);
+ /*
+ * OK, all checkpoint transactions have been checked, now check the
+ * writeback errseq of fs dev and abort the journal if some buffer
+ * failed to write back to the original location, otherwise the
+ * filesystem may become inconsistent.
+ */
+ if (!is_journal_aborted(journal)) {
+ int ret = jbd2_check_fs_dev_write_error(journal);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
mutex_lock_io(&journal->j_checkpoint_mutex);
@@ -1743,22 +2173,25 @@ int jbd2_journal_destroy(journal_t *journal)
++journal->j_transaction_sequence;
write_unlock(&journal->j_state_lock);
- jbd2_mark_journal_empty(journal,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
brelse(journal->j_sb_buffer);
}
+ if (journal->j_shrinker) {
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ shrinker_free(journal->j_shrinker);
+ }
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
iput(journal->j_inode);
if (journal->j_revoke)
jbd2_journal_destroy_revoke(journal);
- if (journal->j_chksum_driver)
- crypto_free_shash(journal->j_chksum_driver);
+ kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
+ lockdep_unregister_key(&journal->jbd2_trans_commit_key);
kfree(journal);
return err;
@@ -1766,7 +2199,7 @@ int jbd2_journal_destroy(journal_t *journal)
/**
- *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1776,18 +2209,14 @@ int jbd2_journal_destroy(journal_t *journal)
* features. Return true (non-zero) if it does.
**/
-int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
if (!compat && !ro && !incompat)
return 1;
- /* Load journal superblock if it is not loaded yet. */
- if (journal->j_format_version == 0 &&
- journal_get_superblock(journal) != 0)
- return 0;
- if (journal->j_format_version == 1)
+ if (!jbd2_format_support_feature(journal))
return 0;
sb = journal->j_superblock;
@@ -1801,7 +2230,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
}
/**
- * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * jbd2_journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1811,17 +2240,13 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
* all of a given set of features on this journal. Return true
* (non-zero) if it can. */
-int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
if (!compat && !ro && !incompat)
return 1;
- /* We can support any known requested features iff the
- * superblock is in version 2. Otherwise we fail to support any
- * extended sb features. */
-
- if (journal->j_format_version != 2)
+ if (!jbd2_format_support_feature(journal))
return 0;
if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
@@ -1832,8 +2257,35 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
return 0;
}
+static int
+jbd2_journal_initialize_fast_commit(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ unsigned long long num_fc_blks;
+
+ num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
+ if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
+ return -ENOSPC;
+
+ /* Are we called twice? */
+ WARN_ON(journal->j_fc_wbuf != NULL);
+ journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
+ sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!journal->j_fc_wbuf)
+ return -ENOMEM;
+
+ journal->j_fc_wbufsize = num_fc_blks;
+ journal->j_fc_last = journal->j_last;
+ journal->j_last = journal->j_fc_last - num_fc_blks;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ journal->j_free = journal->j_last - journal->j_first;
+
+ return 0;
+}
+
/**
- * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1844,7 +2296,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
*
*/
-int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
#define INCOMPAT_FEATURE_ON(f) \
@@ -1870,33 +2322,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
- jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
- /* If enabling v3 checksums, update superblock */
+ if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
+ if (jbd2_journal_initialize_fast_commit(journal)) {
+ pr_err("JBD2: Cannot enable fast commits.\n");
+ return 0;
+ }
+ }
+
+ lock_buffer(journal->j_sb_buffer);
+
+ /* If enabling v3 checksums, update superblock and precompute seed */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
- /* Load the checksum driver */
- if (journal->j_chksum_driver == NULL) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c",
- 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c "
- "driver.\n");
- journal->j_chksum_driver = NULL;
- return 0;
- }
-
- /* Precompute checksum seed for all metadata */
- journal->j_csum_seed = jbd2_chksum(journal, ~0,
- sb->s_uuid,
- sizeof(sb->s_uuid));
- }
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
}
/* If enabling v1 checksums, downgrade superblock */
@@ -1908,6 +2354,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
+ /*
+ * Update the checksum now so that it is valid even for read-only
+ * filesystems where jbd2_write_superblock() doesn't get called.
+ */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(sb);
+ unlock_buffer(journal->j_sb_buffer);
+ jbd2_journal_init_transaction_limits(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -1915,7 +2369,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
}
/*
- * jbd2_journal_clear_features () - Clear a given journal feature in the
+ * jbd2_journal_clear_features() - Clear a given journal feature in the
* superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
@@ -1930,27 +2384,41 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
{
journal_superblock_t *sb;
- jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
+ lock_buffer(journal->j_sb_buffer);
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+ /*
+ * Update the checksum now so that it is valid even for read-only
+ * filesystems where jbd2_write_superblock() doesn't get called.
+ */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(sb);
+ unlock_buffer(journal->j_sb_buffer);
+ jbd2_journal_init_transaction_limits(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_flush () - Flush journal
+ * jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
+ * @flags: optional operation on the journal blocks after the flush (see below)
*
* Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
+ * recovery does not need to happen on remount. Optionally, a discard or zeroout
+ * can be issued on the journal blocks after flushing.
+ *
+ * flags:
+ * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
+ * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
*/
-
-int jbd2_journal_flush(journal_t *journal)
+int jbd2_journal_flush(journal_t *journal, unsigned int flags)
{
int err = 0;
transaction_t *transaction = NULL;
@@ -2003,7 +2471,11 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
+
+ if (flags)
+ err = __jbd2_journal_erase(journal, flags);
+
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2017,7 +2489,7 @@ out:
}
/**
- * int jbd2_journal_wipe() - Wipe journal contents
+ * jbd2_journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
@@ -2031,16 +2503,12 @@ out:
int jbd2_journal_wipe(journal_t *journal, int write)
{
- int err = 0;
+ int err;
J_ASSERT (!(journal->j_flags & JBD2_LOADED));
- err = load_superblock(journal);
- if (err)
- return err;
-
if (!journal->j_tail)
- goto no_recovery;
+ return 0;
printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
write ? "Clearing" : "Ignoring");
@@ -2048,68 +2516,16 @@ int jbd2_journal_wipe(journal_t *journal, int write)
err = jbd2_journal_skip_recovery(journal);
if (write) {
/* Lock to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
- no_recovery:
return err;
}
-/*
- * Journal abort has very specific semantics, which we describe
- * for journal abort.
- *
- * Two internal functions, which provide abort to the jbd layer
- * itself are here.
- */
-
-/*
- * Quick version for internal journal use (doesn't lock the journal).
- * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
- * and don't attempt to make any other journal updates.
- */
-void __jbd2_journal_abort_hard(journal_t *journal)
-{
- transaction_t *transaction;
-
- if (journal->j_flags & JBD2_ABORT)
- return;
-
- printk(KERN_ERR "Aborting journal on device %s.\n",
- journal->j_devname);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_ABORT;
- transaction = journal->j_running_transaction;
- if (transaction)
- __jbd2_log_start_commit(journal, transaction->t_tid);
- write_unlock(&journal->j_state_lock);
-}
-
-/* Soft abort: record the abort error status in the journal superblock,
- * but don't do any other IO. */
-static void __journal_abort_soft (journal_t *journal, int errno)
-{
- if (journal->j_flags & JBD2_ABORT)
- return;
-
- if (!journal->j_errno)
- journal->j_errno = errno;
-
- __jbd2_journal_abort_hard(journal);
-
- if (errno) {
- jbd2_journal_update_sb_errno(journal);
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
- }
-}
-
/**
- * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
@@ -2147,20 +2563,60 @@ static void __journal_abort_soft (journal_t *journal, int errno)
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
*/
void jbd2_journal_abort(journal_t *journal, int errno)
{
- __journal_abort_soft(journal, errno);
+ transaction_t *transaction;
+
+ /*
+ * Lock the aborting procedure until everything is done, this avoid
+ * races between filesystem's error handling flow (e.g. ext4_abort()),
+ * ensure panic after the error info is written into journal's
+ * superblock.
+ */
+ mutex_lock(&journal->j_abort_mutex);
+ /*
+ * ESHUTDOWN always takes precedence because a file system check
+ * caused by any other journal abort error is not required after
+ * a shutdown triggered.
+ */
+ write_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT) {
+ int old_errno = journal->j_errno;
+
+ write_unlock(&journal->j_state_lock);
+ if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
+ journal->j_errno = errno;
+ jbd2_journal_update_sb_errno(journal);
+ }
+ mutex_unlock(&journal->j_abort_mutex);
+ return;
+ }
+
+ /*
+ * Mark the abort as occurred and start current running transaction
+ * to release all journaled buffer.
+ */
+ pr_err("Aborting journal on device %s.\n", journal->j_devname);
+
+ journal->j_flags |= JBD2_ABORT;
+ journal->j_errno = errno;
+ transaction = journal->j_running_transaction;
+ if (transaction)
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ write_unlock(&journal->j_state_lock);
+
+ /*
+ * Record errno to the journal super block, so that fsck and jbd2
+ * layer could realise that a filesystem check is needed.
+ */
+ jbd2_journal_update_sb_errno(journal);
+ mutex_unlock(&journal->j_abort_mutex);
}
/**
- * int jbd2_journal_errno () - returns the journal's error state.
+ * jbd2_journal_errno() - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with jbd2_journal_abort(), the last
@@ -2184,7 +2640,7 @@ int jbd2_journal_errno(journal_t *journal)
}
/**
- * int jbd2_journal_clear_err () - clears the journal's error state
+ * jbd2_journal_clear_err() - clears the journal's error state
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2204,7 +2660,7 @@ int jbd2_journal_clear_err(journal_t *journal)
}
/**
- * void jbd2_journal_ack_err() - Ack journal err.
+ * jbd2_journal_ack_err() - Ack journal err.
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2218,9 +2674,10 @@ void jbd2_journal_ack_err(journal_t *journal)
write_unlock(&journal->j_state_lock);
}
-int jbd2_journal_blocks_per_page(struct inode *inode)
+int jbd2_journal_blocks_per_folio(struct inode *inode)
{
- return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) -
+ inode->i_sb->s_blocksize_bits);
}
/*
@@ -2273,8 +2730,7 @@ static void jbd2_journal_destroy_slabs(void)
int i;
for (i = 0; i < JBD2_MAX_SLABS; i++) {
- if (jbd2_slab[i])
- kmem_cache_destroy(jbd2_slab[i]);
+ kmem_cache_destroy(jbd2_slab[i]);
jbd2_slab[i] = NULL;
}
}
@@ -2355,30 +2811,25 @@ static struct kmem_cache *jbd2_journal_head_cache;
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
-static int jbd2_journal_init_journal_head_cache(void)
+static int __init jbd2_journal_init_journal_head_cache(void)
{
- int retval;
-
- J_ASSERT(jbd2_journal_head_cache == NULL);
+ J_ASSERT(!jbd2_journal_head_cache);
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
NULL); /* ctor */
- retval = 0;
if (!jbd2_journal_head_cache) {
- retval = -ENOMEM;
printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
+ return -ENOMEM;
}
- return retval;
+ return 0;
}
static void jbd2_journal_destroy_journal_head_cache(void)
{
- if (jbd2_journal_head_cache) {
- kmem_cache_destroy(jbd2_journal_head_cache);
- jbd2_journal_head_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_journal_head_cache);
+ jbd2_journal_head_cache = NULL;
}
/*
@@ -2393,11 +2844,12 @@ static struct journal_head *journal_alloc_journal_head(void)
#endif
ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
- jbd_debug(1, "out of memory for journal_head\n");
+ jbd2_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
@@ -2466,7 +2918,7 @@ repeat:
} else {
J_ASSERT_BH(bh,
(atomic_read(&bh->b_count) > 0) ||
- (bh->b_page && bh->b_page->mapping));
+ (bh->b_folio && bh->b_folio->mapping));
if (!new_jh) {
jbd_unlock_bh_journal_head(bh);
@@ -2504,12 +2956,12 @@ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
jbd_unlock_bh_journal_head(bh);
return jh;
}
+EXPORT_SYMBOL(jbd2_journal_grab_journal_head);
static void __journal_remove_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = bh2jh(bh);
- J_ASSERT_JH(jh, jh->b_jcount >= 0);
J_ASSERT_JH(jh, jh->b_transaction == NULL);
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
@@ -2517,17 +2969,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
+
+ /* Unlink before dropping the lock */
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, bh->b_size);
+ jbd2_free(jh->b_frozen_data, b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, bh->b_size);
+ jbd2_free(jh->b_committed_data, b_size);
}
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
journal_free_journal_head(jh);
}
@@ -2545,10 +3003,13 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
+ journal_release_journal_head(jh, bh->b_size);
__brelse(bh);
- } else
+ } else {
jbd_unlock_bh_journal_head(bh);
+ }
}
+EXPORT_SYMBOL(jbd2_journal_put_journal_head);
/*
* Initialize jbd inode head
@@ -2559,6 +3020,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
jinode->i_next_transaction = NULL;
jinode->i_vfs_inode = inode;
jinode->i_flags = 0;
+ jinode->i_dirty_start = 0;
+ jinode->i_dirty_end = 0;
INIT_LIST_HEAD(&jinode->i_list);
}
@@ -2618,29 +3081,38 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
+static int __init jbd2_journal_init_inode_cache(void)
+{
+ J_ASSERT(!jbd2_inode_cache);
+ jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
+ if (!jbd2_inode_cache) {
+ pr_emerg("JBD2: failed to create inode cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static int __init jbd2_journal_init_handle_cache(void)
{
+ J_ASSERT(!jbd2_handle_cache);
jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
- if (jbd2_handle_cache == NULL) {
+ if (!jbd2_handle_cache) {
printk(KERN_EMERG "JBD2: failed to create handle cache\n");
return -ENOMEM;
}
- jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
- if (jbd2_inode_cache == NULL) {
- printk(KERN_EMERG "JBD2: failed to create inode cache\n");
- kmem_cache_destroy(jbd2_handle_cache);
- return -ENOMEM;
- }
return 0;
}
-static void jbd2_journal_destroy_handle_cache(void)
+static void jbd2_journal_destroy_inode_cache(void)
{
- if (jbd2_handle_cache)
- kmem_cache_destroy(jbd2_handle_cache);
- if (jbd2_inode_cache)
- kmem_cache_destroy(jbd2_inode_cache);
+ kmem_cache_destroy(jbd2_inode_cache);
+ jbd2_inode_cache = NULL;
+}
+static void jbd2_journal_destroy_handle_cache(void)
+{
+ kmem_cache_destroy(jbd2_handle_cache);
+ jbd2_handle_cache = NULL;
}
/*
@@ -2651,21 +3123,27 @@ static int __init journal_init_caches(void)
{
int ret;
- ret = jbd2_journal_init_revoke_caches();
+ ret = jbd2_journal_init_revoke_record_cache();
+ if (ret == 0)
+ ret = jbd2_journal_init_revoke_table_cache();
if (ret == 0)
ret = jbd2_journal_init_journal_head_cache();
if (ret == 0)
ret = jbd2_journal_init_handle_cache();
if (ret == 0)
+ ret = jbd2_journal_init_inode_cache();
+ if (ret == 0)
ret = jbd2_journal_init_transaction_cache();
return ret;
}
static void jbd2_journal_destroy_caches(void)
{
- jbd2_journal_destroy_revoke_caches();
+ jbd2_journal_destroy_revoke_record_cache();
+ jbd2_journal_destroy_revoke_table_cache();
jbd2_journal_destroy_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_inode_cache();
jbd2_journal_destroy_transaction_cache();
jbd2_journal_destroy_slabs();
}
@@ -2696,6 +3174,7 @@ static void __exit journal_exit(void)
jbd2_journal_destroy_caches();
}
+MODULE_DESCRIPTION("Generic filesystem journal-writing module");
MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 02dd3360cb20..cac8c2cd4a92 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/recovery.c
*
@@ -5,10 +6,6 @@
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
@@ -22,6 +19,7 @@
#include <linux/errno.h>
#include <linux/crc32.h>
#include <linux/blkdev.h>
+#include <linux/string_choices.h>
#endif
/*
@@ -32,16 +30,16 @@ struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
+ unsigned long head_block;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
-enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
+static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
@@ -67,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n)
*/
#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static void do_readahead(journal_t *journal, unsigned int start)
{
- int err;
unsigned int max, nbufs, next;
unsigned long long blocknr;
struct buffer_head *bh;
@@ -78,8 +75,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
- if (max > journal->j_maxlen)
- max = journal->j_maxlen;
+ if (max > journal->j_total_len)
+ max = journal->j_total_len;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
@@ -87,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
nbufs = 0;
for (next = start; next < max; next++) {
- err = jbd2_journal_bmap(journal, next, &blocknr);
+ int err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
@@ -96,15 +93,13 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- err = -ENOMEM;
+ if (!bh)
goto failed;
- }
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -113,13 +108,11 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
- err = 0;
+ bh_readahead_batch(nbufs, bufs, 0);
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
- return err;
}
#endif /* __KERNEL__ */
@@ -138,7 +131,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
*bhp = NULL;
- if (offset >= journal->j_maxlen) {
+ if (offset >= journal->j_total_len) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EFSCORRUPTED;
}
@@ -156,9 +149,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return -ENOMEM;
if (!buffer_uptodate(bh)) {
- /* If this is a brand new buffer, start readahead.
- Otherwise, we assume we are already reading it. */
- if (!buffer_req(bh))
+ /*
+ * If this is a brand new buffer, start readahead.
+ * Otherwise, we assume we are already reading it.
+ */
+ bool need_readahead = !buffer_req(bh);
+
+ bh_read_nowait(bh, 0);
+ if (need_readahead)
do_readahead(journal, offset);
wait_on_buffer(bh);
}
@@ -183,11 +181,11 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
- tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
- sizeof(struct jbd2_journal_block_tail));
+ tail = (struct jbd2_journal_block_tail *)((char *)buf +
+ j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
provided = tail->t_checksum;
tail->t_checksum = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
tail->t_checksum = provided;
return provided == cpu_to_be32(calculated);
@@ -200,7 +198,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
static int count_tags(journal_t *journal, struct buffer_head *bh)
{
char * tagp;
- journal_block_tag_t * tag;
+ journal_block_tag_t tag;
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
@@ -210,14 +208,14 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes) <= size) {
- tag = (journal_block_tag_t *) tagp;
+ memcpy(&tag, tagp, sizeof(tag));
nr++;
tagp += tag_bytes;
- if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
+ if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
tagp += 16;
- if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
+ if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
break;
}
@@ -232,6 +230,43 @@ do { \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
+static int fc_do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+{
+ unsigned int expected_commit_id = info->end_transaction;
+ unsigned long next_fc_block;
+ struct buffer_head *bh;
+ int err = 0;
+
+ next_fc_block = journal->j_fc_first;
+ if (!journal->j_fc_replay_callback)
+ return 0;
+
+ while (next_fc_block <= journal->j_fc_last) {
+ jbd2_debug(3, "Fast commit replay: next block %ld\n",
+ next_fc_block);
+ err = jread(&bh, journal, next_fc_block);
+ if (err) {
+ jbd2_debug(3, "Fast commit replay: read error\n");
+ break;
+ }
+
+ err = journal->j_fc_replay_callback(journal, bh, pass,
+ next_fc_block - journal->j_fc_first,
+ expected_commit_id);
+ brelse(bh);
+ next_fc_block++;
+ if (err < 0 || err == JBD2_FC_REPLAY_STOP)
+ break;
+ err = 0;
+ }
+
+ if (err)
+ jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
+
+ return err;
+}
+
/**
* jbd2_journal_recover - recovers a on-disk journal
* @journal: the journal to recover
@@ -247,23 +282,24 @@ do { \
int jbd2_journal_recover(journal_t *journal)
{
int err, err2;
- journal_superblock_t * sb;
-
struct recovery_info info;
memset(&info, 0, sizeof(info));
- sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
- * unmounted.
+ * unmounted. We use its in-memory version j_tail here because
+ * jbd2_journal_wipe() could have updated it without updating journal
+ * superblock.
*/
+ if (!journal->j_tail) {
+ journal_superblock_t *sb = journal->j_superblock;
- if (!sb->s_start) {
- jbd_debug(1, "No recovery required, last transaction %d\n",
- be32_to_cpu(sb->s_sequence));
+ jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n",
+ be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+ journal->j_head = be32_to_cpu(sb->s_head);
return 0;
}
@@ -273,23 +309,35 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
- jbd_debug(1, "JBD2: recovery, exit status %d, "
+ jbd2_debug(1, "JBD2: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
- jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+ jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
+ journal->j_head = info.head_block;
+ jbd2_debug(1, "JBD2: last transaction %d, head block %lu\n",
+ journal->j_transaction_sequence, journal->j_head);
jbd2_journal_clear_revoke(journal);
+ /* Free revoke table allocated for replay */
+ if (journal->j_revoke != journal->j_revoke_table[0] &&
+ journal->j_revoke != journal->j_revoke_table[1]) {
+ jbd2_journal_destroy_revoke_table(journal->j_revoke);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ err2 = jbd2_check_fs_dev_write_error(journal);
+ if (!err)
+ err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
- err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ err2 = blkdev_issue_flush(journal->j_fs_dev);
if (!err)
err = err2;
}
@@ -322,15 +370,17 @@ int jbd2_journal_skip_recovery(journal_t *journal)
if (err) {
printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
+ journal->j_head = journal->j_first;
} else {
#ifdef CONFIG_JBD2_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
- jbd_debug(1,
+ jbd2_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
- dropped, (dropped == 1) ? "" : "s");
+ dropped, str_plural(dropped));
#endif
journal->j_transaction_sequence = ++info.end_transaction;
+ journal->j_head = info.head_block;
}
journal->j_tail = 0;
@@ -390,16 +440,37 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
h = buf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
h->h_chksum[0] = provided;
return provided == cpu_to_be32(calculated);
}
+static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
+{
+ struct commit_header *h;
+ __be32 provided;
+ __u32 calculated;
+ void *tmpbuf;
+
+ tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL);
+ if (!tmpbuf)
+ return false;
+
+ memcpy(tmpbuf, buf, sizeof(struct commit_header));
+ h = tmpbuf;
+ provided = h->h_chksum[0];
+ h->h_chksum[0] = 0;
+ calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize);
+ kfree(tmpbuf);
+
+ return provided == cpu_to_be32(calculated);
+}
+
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+ journal_block_tag3_t *tag3,
void *buf, __u32 sequence)
{
- journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
@@ -407,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return 1;
seq = cpu_to_be32(sequence);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, buf, j->j_blocksize);
if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
@@ -416,21 +487,118 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return tag->t_checksum == cpu_to_be16(csum32);
}
+static __always_inline int jbd2_do_replay(journal_t *journal,
+ struct recovery_info *info,
+ struct buffer_head *bh,
+ unsigned long *next_log_block,
+ unsigned int next_commit_ID)
+{
+ char *tagp;
+ int flags;
+ int ret = 0;
+ int tag_bytes = journal_tag_bytes(journal);
+ int descr_csum_size = 0;
+ unsigned long io_block;
+ journal_block_tag_t tag;
+ struct buffer_head *obh;
+ struct buffer_head *nbh;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ descr_csum_size = sizeof(struct jbd2_journal_block_tail);
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while (tagp - bh->b_data + tag_bytes <=
+ journal->j_blocksize - descr_csum_size) {
+ int err;
+
+ memcpy(&tag, tagp, sizeof(tag));
+ flags = be16_to_cpu(tag.t_flags);
+
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ /* Recover what we can, but report failure at the end. */
+ ret = err;
+ pr_err("JBD2: IO error %d recovering block %lu in log\n",
+ err, io_block);
+ } else {
+ unsigned long long blocknr;
+
+ J_ASSERT(obh != NULL);
+ blocknr = read_tag_block(journal, &tag);
+
+ /* If the block has been revoked, then we're all done here. */
+ if (jbd2_journal_test_revoke(journal, blocknr,
+ next_commit_ID)) {
+ brelse(obh);
+ ++info->nr_revoke_hits;
+ goto skip_write;
+ }
+
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(journal, &tag,
+ (journal_block_tag3_t *)tagp,
+ obh->b_data, next_commit_ID)) {
+ brelse(obh);
+ ret = -EFSBADCRC;
+ pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n",
+ blocknr, io_block);
+ goto skip_write;
+ }
+
+ /* Find a buffer for the new data being restored */
+ nbh = __getblk(journal->j_fs_dev, blocknr,
+ journal->j_blocksize);
+ if (nbh == NULL) {
+ pr_err("JBD2: Out of memory during recovery.\n");
+ brelse(obh);
+ return -ENOMEM;
+ }
+
+ lock_buffer(nbh);
+ memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
+ if (flags & JBD2_FLAG_ESCAPE) {
+ *((__be32 *)nbh->b_data) =
+ cpu_to_be32(JBD2_MAGIC_NUMBER);
+ }
+
+ BUFFER_TRACE(nbh, "marking dirty");
+ set_buffer_uptodate(nbh);
+ mark_buffer_dirty(nbh);
+ BUFFER_TRACE(nbh, "marking uptodate");
+ ++info->nr_replays;
+ unlock_buffer(nbh);
+ brelse(obh);
+ brelse(nbh);
+ }
+
+skip_write:
+ tagp += tag_bytes;
+ if (!(flags & JBD2_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JBD2_FLAG_LAST_TAG)
+ break;
+ }
+
+ return ret;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned long next_log_block, head_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
- struct buffer_head * bh;
+ struct buffer_head *bh = NULL;
unsigned int sequence;
int blocktype;
- int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
- int descr_csum_size = 0;
- int block_error = 0;
+ bool need_check_commit_time = false;
+ __u64 last_trans_commit_time = 0, commit_time;
/*
* First thing is to establish what we expect to find in the log
@@ -441,12 +609,38 @@ static int do_one_pass(journal_t *journal,
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
+ head_block = next_log_block;
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
+ else if (pass == PASS_REVOKE) {
+ /*
+ * Would the default revoke table have too long hash chains
+ * during replay?
+ */
+ if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
+ unsigned int hash_size;
+
+ /*
+ * Aim for average chain length of 8, limit at 1M
+ * entries to avoid problems with malicious
+ * filesystems.
+ */
+ hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
+ 1U << 20);
+ journal->j_revoke =
+ jbd2_journal_init_revoke_table(hash_size);
+ if (!journal->j_revoke) {
+ printk(KERN_ERR
+ "JBD2: failed to allocate revoke table for replay with %u entries. "
+ "Journal replay may be slow.\n", hash_size);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
+ }
+ }
- jbd_debug(1, "Starting recovery pass %d\n", pass);
+ jbd2_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
@@ -456,12 +650,6 @@ static int do_one_pass(journal_t *journal,
*/
while (1) {
- int flags;
- char * tagp;
- journal_block_tag_t * tag;
- struct buffer_head * obh;
- struct buffer_head * nbh;
-
cond_resched();
/* If we already know where to stop the log traversal,
@@ -472,14 +660,16 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ brelse(bh);
+ bh = NULL;
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -495,20 +685,16 @@ static int do_one_pass(journal_t *journal,
tmp = (journal_header_t *)bh->b_data;
- if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- brelse(bh);
+ if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER))
break;
- }
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
- jbd_debug(3, "Found magic %d, sequence %d\n",
+ jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
- if (sequence != next_commit_ID) {
- brelse(bh);
+ if (sequence != next_commit_ID)
break;
- }
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
@@ -517,18 +703,22 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (jbd2_journal_has_csum_v2or3(journal))
- descr_csum_size =
- sizeof(struct jbd2_journal_block_tail);
- if (descr_csum_size > 0 &&
- !jbd2_descriptor_block_csum_verify(journal,
+ if (!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
- printk(KERN_ERR "JBD2: Invalid checksum "
- "recovering block %lu in log\n",
- next_log_block);
- err = -EFSBADCRC;
- brelse(bh);
- goto failed;
+ /*
+ * PASS_SCAN can see stale blocks due to lazy
+ * journal init. Don't error out on those yet.
+ */
+ if (pass != PASS_SCAN) {
+ pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
+ next_log_block);
+ err = -EFSBADCRC;
+ goto failed;
+ }
+ need_check_commit_time = true;
+ jbd2_debug(1,
+ "invalid descriptor block found in %lu\n",
+ next_log_block);
}
/* If it is a valid descriptor block, replay it
@@ -541,121 +731,36 @@ static int do_one_pass(journal_t *journal,
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
- &crc32_sum)) {
- put_bh(bh);
+ &crc32_sum))
break;
- }
- put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
- put_bh(bh);
continue;
}
- /* A descriptor block: we can now write all of
- * the data blocks. Yay, useful work is finally
- * getting done here! */
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize - descr_csum_size) {
- unsigned long io_block;
-
- tag = (journal_block_tag_t *) tagp;
- flags = be16_to_cpu(tag->t_flags);
-
- io_block = next_log_block++;
- wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
- if (err) {
- /* Recover what we can, but
- * report failure at the end. */
- success = err;
- printk(KERN_ERR
- "JBD2: IO error %d recovering "
- "block %ld in log\n",
- err, io_block);
- } else {
- unsigned long long blocknr;
-
- J_ASSERT(obh != NULL);
- blocknr = read_tag_block(journal,
- tag);
-
- /* If the block has been
- * revoked, then we're all done
- * here. */
- if (jbd2_journal_test_revoke
- (journal, blocknr,
- next_commit_ID)) {
- brelse(obh);
- ++info->nr_revoke_hits;
- goto skip_write;
- }
-
- /* Look for block corruption */
- if (!jbd2_block_tag_csum_verify(
- journal, tag, obh->b_data,
- be32_to_cpu(tmp->h_sequence))) {
- brelse(obh);
- success = -EFSBADCRC;
- printk(KERN_ERR "JBD2: Invalid "
- "checksum recovering "
- "block %llu in log\n",
- blocknr);
- block_error = 1;
- goto skip_write;
- }
-
- /* Find a buffer for the new
- * data being restored */
- nbh = __getblk(journal->j_fs_dev,
- blocknr,
- journal->j_blocksize);
- if (nbh == NULL) {
- printk(KERN_ERR
- "JBD2: Out of memory "
- "during recovery.\n");
- err = -ENOMEM;
- brelse(bh);
- brelse(obh);
- goto failed;
- }
-
- lock_buffer(nbh);
- memcpy(nbh->b_data, obh->b_data,
- journal->j_blocksize);
- if (flags & JBD2_FLAG_ESCAPE) {
- *((__be32 *)nbh->b_data) =
- cpu_to_be32(JBD2_MAGIC_NUMBER);
- }
-
- BUFFER_TRACE(nbh, "marking dirty");
- set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
- BUFFER_TRACE(nbh, "marking uptodate");
- ++info->nr_replays;
- /* ll_rw_block(WRITE, 1, &nbh); */
- unlock_buffer(nbh);
- brelse(obh);
- brelse(nbh);
- }
-
- skip_write:
- tagp += tag_bytes;
- if (!(flags & JBD2_FLAG_SAME_UUID))
- tagp += 16;
-
- if (flags & JBD2_FLAG_LAST_TAG)
- break;
+ /*
+ * A descriptor block: we can now write all of the
+ * data blocks. Yay, useful work is finally getting
+ * done here!
+ */
+ err = jbd2_do_replay(journal, info, bh, &next_log_block,
+ next_commit_ID);
+ if (err) {
+ if (err == -ENOMEM)
+ goto failed;
+ success = err;
}
- brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
+ if (pass != PASS_SCAN) {
+ next_commit_ID++;
+ continue;
+ }
+
/* How to differentiate between interrupted commit
* and journal corruption ?
*
@@ -686,101 +791,127 @@ static int do_one_pass(journal_t *journal,
* mentioned conditions. Hence assume
* "Interrupted Commit".)
*/
+ commit_time = be64_to_cpu(
+ ((struct commit_header *)bh->b_data)->h_commit_sec);
+ /*
+ * If need_check_commit_time is set, it means we are in
+ * PASS_SCAN and csum verify failed before. If
+ * commit_time is increasing, it's the same journal,
+ * otherwise it is stale journal block, just end this
+ * recovery.
+ */
+ if (need_check_commit_time) {
+ if (commit_time >= last_trans_commit_time) {
+ pr_err("JBD2: Invalid checksum found in transaction %u\n",
+ next_commit_ID);
+ err = -EFSBADCRC;
+ goto failed;
+ }
+ ignore_crc_mismatch:
+ /*
+ * It likely does not belong to same journal,
+ * just end this recovery with success.
+ */
+ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+ next_commit_ID);
+ goto done;
+ }
- /* Found an expected commit block: if checksums
- * are present verify them in PASS_SCAN; else not
+ /*
+ * Found an expected commit block: if checksums
+ * are present, verify them in PASS_SCAN; else not
* much to do other than move on to the next sequence
- * number. */
- if (pass == PASS_SCAN &&
- jbd2_has_feature_checksum(journal)) {
- int chksum_err, chksum_seen;
+ * number.
+ */
+ if (jbd2_has_feature_checksum(journal)) {
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
be32_to_cpu(cbh->h_chksum[0]);
- chksum_err = chksum_seen = 0;
-
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
- brelse(bh);
break;
}
- if (crc32_sum == found_chksum &&
- cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
- cbh->h_chksum_size ==
- JBD2_CRC32_CHKSUM_SIZE)
- chksum_seen = 1;
- else if (!(cbh->h_chksum_type == 0 &&
- cbh->h_chksum_size == 0 &&
- found_chksum == 0 &&
- !chksum_seen))
- /*
- * If fs is mounted using an old kernel and then
- * kernel with journal_chksum is used then we
- * get a situation where the journal flag has
- * checksum flag set but checksums are not
- * present i.e chksum = 0, in the individual
- * commit blocks.
- * Hence to avoid checksum failures, in this
- * situation, this extra check is added.
- */
- chksum_err = 1;
-
- if (chksum_err) {
- info->end_transaction = next_commit_ID;
+ /* Neither checksum match nor unused? */
+ if (!((crc32_sum == found_chksum &&
+ cbh->h_chksum_type ==
+ JBD2_CRC32_CHKSUM &&
+ cbh->h_chksum_size ==
+ JBD2_CRC32_CHKSUM_SIZE) ||
+ (cbh->h_chksum_type == 0 &&
+ cbh->h_chksum_size == 0 &&
+ found_chksum == 0)))
+ goto chksum_error;
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
- }
crc32_sum = ~0;
+ goto chksum_ok;
}
- if (pass == PASS_SCAN &&
- !jbd2_commit_block_csum_verify(journal,
- bh->b_data)) {
- info->end_transaction = next_commit_ID;
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
+ if (jbd2_commit_block_csum_verify(journal, bh->b_data))
+ goto chksum_ok;
+
+ if (jbd2_commit_block_csum_verify_partial(journal,
+ bh->b_data)) {
+ pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+ next_commit_ID, next_log_block);
+ goto chksum_ok;
}
- brelse(bh);
+
+chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
+ info->end_transaction = next_commit_ID;
+ info->head_block = head_block;
+
+ if (!jbd2_has_feature_async_commit(journal)) {
+ journal->j_failed_commit = next_commit_ID;
+ break;
+ }
+
+chksum_ok:
+ last_trans_commit_time = commit_time;
+ head_block = next_log_block;
next_commit_ID++;
continue;
case JBD2_REVOKE_BLOCK:
- /* If we aren't in the REVOKE pass, then we can
- * just skip over this block. */
- if (pass != PASS_REVOKE) {
- brelse(bh);
+ /*
+ * If we aren't in the SCAN or REVOKE pass, then we can
+ * just skip over this block.
+ */
+ if (pass != PASS_REVOKE && pass != PASS_SCAN)
continue;
+
+ /*
+ * Check revoke block crc in pass_scan, if csum verify
+ * failed, check commit block time later.
+ */
+ if (pass == PASS_SCAN &&
+ !jbd2_descriptor_block_csum_verify(journal,
+ bh->b_data)) {
+ jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
+ next_log_block);
+ need_check_commit_time = true;
}
- err = scan_revoke_records(journal, bh,
+ err = scan_revoke_records(journal, pass, bh,
next_commit_ID, info);
- brelse(bh);
if (err)
goto failed;
continue;
default:
- jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
- brelse(bh);
goto done;
}
}
done:
+ brelse(bh);
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
@@ -791,6 +922,8 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN) {
if (!info->end_transaction)
info->end_transaction = next_commit_ID;
+ if (!info->head_block)
+ info->head_block = head_block;
} else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
@@ -802,22 +935,29 @@ static int do_one_pass(journal_t *journal,
success = -EIO;
}
}
- if (block_error && success == 0)
- success = -EIO;
+
+ if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) {
+ err = fc_do_one_pass(journal, info, pass);
+ if (err)
+ success = err;
+ }
+
return success;
failed:
+ brelse(bh);
return err;
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
- tid_t sequence, struct recovery_info *info)
+static int scan_revoke_records(journal_t *journal, enum passtype pass,
+ struct buffer_head *bh, tid_t sequence,
+ struct recovery_info *info)
{
jbd2_journal_revoke_header_t *header;
int offset, max;
- int csum_size = 0;
+ unsigned csum_size = 0;
__u32 rcount;
int record_len = 4;
@@ -825,9 +965,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
rcount = be32_to_cpu(header->r_count);
- if (!jbd2_descriptor_block_csum_verify(journal, header))
- return -EFSBADCRC;
-
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
if (rcount > journal->j_blocksize - csum_size)
@@ -837,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
if (jbd2_has_feature_64bit(journal))
record_len = 8;
+ if (pass == PASS_SCAN) {
+ info->nr_revokes += (max - offset) / record_len;
+ return 0;
+ }
+
while (offset + record_len <= max) {
unsigned long long blocknr;
int err;
@@ -849,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
- ++info->nr_revokes;
}
return 0;
}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f9aefcda5854..1467f6790747 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/revoke.c
*
@@ -5,10 +6,6 @@
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
@@ -181,40 +178,44 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
return NULL;
}
-void jbd2_journal_destroy_revoke_caches(void)
+void jbd2_journal_destroy_revoke_record_cache(void)
{
- if (jbd2_revoke_record_cache) {
- kmem_cache_destroy(jbd2_revoke_record_cache);
- jbd2_revoke_record_cache = NULL;
- }
- if (jbd2_revoke_table_cache) {
- kmem_cache_destroy(jbd2_revoke_table_cache);
- jbd2_revoke_table_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_revoke_record_cache);
+ jbd2_revoke_record_cache = NULL;
}
-int __init jbd2_journal_init_revoke_caches(void)
+void jbd2_journal_destroy_revoke_table_cache(void)
{
- J_ASSERT(!jbd2_revoke_record_cache);
- J_ASSERT(!jbd2_revoke_table_cache);
+ kmem_cache_destroy(jbd2_revoke_table_cache);
+ jbd2_revoke_table_cache = NULL;
+}
+int __init jbd2_journal_init_revoke_record_cache(void)
+{
+ J_ASSERT(!jbd2_revoke_record_cache);
jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
- if (!jbd2_revoke_record_cache)
- goto record_cache_failure;
+ if (!jbd2_revoke_record_cache) {
+ pr_emerg("JBD2: failed to create revoke_record cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int __init jbd2_journal_init_revoke_table_cache(void)
+{
+ J_ASSERT(!jbd2_revoke_table_cache);
jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
SLAB_TEMPORARY);
- if (!jbd2_revoke_table_cache)
- goto table_cache_failure;
- return 0;
-table_cache_failure:
- jbd2_journal_destroy_revoke_caches();
-record_cache_failure:
+ if (!jbd2_revoke_table_cache) {
+ pr_emerg("JBD2: failed to create revoke_table cache\n");
return -ENOMEM;
+ }
+ return 0;
}
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
+struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
int shift = 0;
int tmp = hash_size;
@@ -230,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
table->hash_size = hash_size;
table->hash_shift = shift;
table->hash_table =
- kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+ kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(jbd2_revoke_table_cache, table);
table = NULL;
@@ -244,7 +245,7 @@ out:
return table;
}
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
+void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
@@ -254,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
J_ASSERT(list_empty(hash_list));
}
- kfree(table->hash_table);
+ kvfree(table->hash_table);
kmem_cache_free(jbd2_revoke_table_cache, table);
}
@@ -344,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;
if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -354,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -370,6 +373,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
}
#endif
+ if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
+ if (!bh_in)
+ brelse(bh);
+ return -EIO;
+ }
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
@@ -390,8 +398,9 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
__brelse(bh);
}
}
+ handle->h_revoke_credits--;
- jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+ jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -413,15 +422,14 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
-int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
- int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
- jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+ jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
@@ -437,13 +445,12 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
- jbd_debug(4, "cancelled existing revoke on "
+ jbd2_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
- did_revoke = 1;
}
}
@@ -459,18 +466,18 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
+ bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
- return did_revoke;
}
/*
- * journal_clear_revoked_flag clears revoked flag of buffers in
+ * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffers in the next
* transaction which is going to be started.
*/
@@ -488,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
+ bh = __find_get_block_nonatomic(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
@@ -499,9 +506,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
}
}
-/* journal_switch_revoke table select j_revoke for next transaction
- * we do not want to suspend any processing until all revokes are
- * written -bzzz
+/* jbd2_journal_switch_revoke_table table select j_revoke for next
+ * transaction we do not want to suspend any processing until all
+ * revokes are written -bzzz
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
@@ -553,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
}
if (descriptor)
flush_descriptor(journal, descriptor, offset);
- jbd_debug(1, "Wrote %d revoke records\n", count);
+ jbd2_debug(1, "Wrote %d revoke records\n", count);
}
/*
@@ -637,10 +644,8 @@ static void flush_descriptor(journal_t *journal,
{
jbd2_journal_revoke_header_t *header;
- if (is_journal_aborted(journal)) {
- put_bh(descriptor);
+ if (is_journal_aborted(journal))
return;
- }
header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
@@ -649,7 +654,7 @@ static void flush_descriptor(journal_t *journal,
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
- write_dirty_buffer(descriptor, REQ_SYNC);
+ write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8b08044b3120..dca4b5d8aaaa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/transaction.c
*
@@ -5,10 +6,6 @@
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
* Generic filesystem transaction handling code; part of the ext2fs
* journaling system.
*
@@ -45,17 +42,17 @@ int __init jbd2_journal_init_transaction_cache(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
NULL);
- if (transaction_cache)
- return 0;
- return -ENOMEM;
+ if (!transaction_cache) {
+ pr_emerg("JBD2: failed to create transaction cache\n");
+ return -ENOMEM;
+ }
+ return 0;
}
void jbd2_journal_destroy_transaction_cache(void)
{
- if (transaction_cache) {
- kmem_cache_destroy(transaction_cache);
- transaction_cache = NULL;
- }
+ kmem_cache_destroy(transaction_cache);
+ transaction_cache = NULL;
}
void jbd2_journal_free_transaction(transaction_t *transaction)
@@ -68,7 +65,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
- * Simply allocate and initialise a new transaction. Create it in
+ * Simply initialise a new transaction. Initialize it in
* RUNNING state and add it to the current journal (which should not
* have an existing running transaction: we only make a new transaction
* once we have started to commit the old one).
@@ -80,21 +77,21 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
*
*/
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+ transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
- spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
+ journal->j_transaction_overhead_buffers +
atomic_read(&journal->j_reserved_credits));
+ atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
- INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
@@ -105,8 +102,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
transaction->t_requested = 0;
-
- return transaction;
}
/*
@@ -118,34 +113,27 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
*/
/*
- * Update transaction's maximum wait time, if debugging is enabled.
- *
- * In order for t_max_wait to be reliable, it must be protected by a
- * lock. But doing so will mean that start_this_handle() can not be
- * run in parallel on SMP systems, which limits our scalability. So
- * unless debugging is enabled, we no longer update t_max_wait, which
- * means that maximum wait time reported by the jbd2_run_stats
- * tracepoint will always be zero.
+ * t_max_wait is carefully updated here with use of atomic compare exchange.
+ * Note that there could be multiplre threads trying to do this simultaneously
+ * hence using cmpxchg to avoid any use of locks in this case.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
{
-#ifdef CONFIG_JBD2_DEBUG
- if (jbd2_journal_enable_debug &&
- time_after(transaction->t_start, ts)) {
- ts = jbd2_time_diff(ts, transaction->t_start);
- spin_lock(&transaction->t_handle_lock);
- if (ts > transaction->t_max_wait)
- transaction->t_max_wait = ts;
- spin_unlock(&transaction->t_handle_lock);
+ unsigned long oldts, newts;
+
+ if (time_after(transaction->t_start, ts)) {
+ newts = jbd2_time_diff(ts, transaction->t_start);
+ oldts = READ_ONCE(transaction->t_max_wait);
+ while (oldts < newts)
+ oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
}
-#endif
}
/*
- * Wait until running transaction passes T_LOCKED state. Also starts the commit
- * if needed. The function expects running transaction to exist and releases
- * j_state_lock.
+ * Wait until running transaction passes to T_FLUSH state and new transaction
+ * can thus be started. Also starts the commit if needed. The function expects
+ * running transaction to exist and releases j_state_lock.
*/
static void wait_transaction_locked(journal_t *journal)
__releases(journal->j_state_lock)
@@ -154,7 +142,7 @@ static void wait_transaction_locked(journal_t *journal)
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -165,20 +153,61 @@ static void wait_transaction_locked(journal_t *journal)
finish_wait(&journal->j_wait_transaction_locked, &wait);
}
+/*
+ * Wait until running transaction transitions from T_SWITCH to T_FLUSH
+ * state and new transaction can thus be started. The function releases
+ * j_state_lock.
+ */
+static void wait_transaction_switching(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (WARN_ON(!journal->j_running_transaction ||
+ journal->j_running_transaction->t_state != T_SWITCH)) {
+ read_unlock(&journal->j_state_lock);
+ return;
+ }
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ read_unlock(&journal->j_state_lock);
+ /*
+ * We don't call jbd2_might_wait_for_commit() here as there's no
+ * waiting for outstanding handles happening anymore in T_SWITCH state
+ * and handling of reserved handles actually relies on that for
+ * correctness.
+ */
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
static void sub_reserved_credits(journal_t *journal, int blocks)
{
atomic_sub(blocks, &journal->j_reserved_credits);
wake_up(&journal->j_wait_reserved);
}
+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+ return journal->j_max_transaction_buffers -
+ journal->j_transaction_overhead_buffers;
+}
+
/*
* Wait until we can add credits for handle to the running transaction. Called
* with j_state_lock held for reading. Returns 0 if handle joined the running
* transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
* caller must retry.
+ *
+ * Note: because j_state_lock may be dropped depending on the return
+ * value, we need to fake out sparse so ti doesn't complain about a
+ * locking imbalance. Callers of add_transaction_credits will need to
+ * make a similar accomodation.
*/
static int add_transaction_credits(journal_t *journal, int blocks,
int rsv_blocks)
+__must_hold(&journal->j_state_lock)
{
transaction_t *t = journal->j_running_transaction;
int needed;
@@ -188,8 +217,10 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* If the current transaction is locked down for commit, wait
* for the lock to be released.
*/
- if (t->t_state == T_LOCKED) {
+ if (t->t_state != T_RUNNING) {
+ WARN_ON_ONCE(t->t_state >= T_FLUSH);
wait_transaction_locked(journal);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -212,16 +243,18 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* big to fit this handle? Wait until reserved credits are freed.
*/
if (atomic_read(&journal->j_reserved_credits) + total >
- journal->j_max_transaction_buffers) {
+ jbd2_max_user_trans_buffers(journal)) {
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
wait_transaction_locked(journal);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -236,14 +269,16 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* *before* starting to dirty potentially checkpointed buffers
* in the new transaction.
*/
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ if (jbd2_log_space_left(journal) <
+ journal->j_max_transaction_buffers)
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -253,14 +288,15 @@ static int add_transaction_credits(journal_t *journal, int blocks,
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
/* We allow at most half of a transaction to be reserved */
- if (needed > journal->j_max_transaction_buffers / 2) {
+ if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
- <= journal->j_max_transaction_buffers / 2);
+ <= jbd2_max_user_trans_buffers(journal) / 2);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
return 0;
@@ -277,30 +313,35 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- int blocks = handle->h_buffer_credits;
+ int blocks = handle->h_total_credits;
int rsv_blocks = 0;
unsigned long ts = jiffies;
if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+ rsv_blocks = handle->h_rsv_handle->h_total_credits;
/*
* Limit the number of reserved credits to 1/2 of maximum transaction
* size and limit the number of total credits to not exceed maximum
* transaction size per operation.
*/
- if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
- (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
+ rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
printk(KERN_ERR "JBD2: %s wants too many credits "
"credits:%d rsv_credits:%d max:%d\n",
current->comm, blocks, rsv_blocks,
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
WARN_ON(1);
return -ENOSPC;
}
alloc_transaction:
- if (!journal->j_running_transaction) {
+ /*
+ * This check is racy but it is just an optimization of allocating new
+ * transaction early if there are high chances we'll need it. If we
+ * guess wrong, we'll retry or free unused transaction.
+ */
+ if (!data_race(journal->j_running_transaction)) {
/*
* If __GFP_FS is not present, then we may be being called from
* inside the fs writeback layer, so we MUST NOT fail.
@@ -313,7 +354,7 @@ alloc_transaction:
return -ENOMEM;
}
- jbd_debug(3, "New handle %p going live.\n", handle);
+ jbd2_debug(3, "New handle %p going live.\n", handle);
/*
* We need to hold j_state_lock until t_updates has been incremented,
@@ -359,35 +400,48 @@ repeat:
if (!handle->h_reserved) {
/* We may have dropped j_state_lock - restart in that case */
- if (add_transaction_credits(journal, blocks, rsv_blocks))
+ if (add_transaction_credits(journal, blocks, rsv_blocks)) {
+ /*
+ * add_transaction_credits releases
+ * j_state_lock on a non-zero return
+ */
+ __release(&journal->j_state_lock);
goto repeat;
+ }
} else {
/*
* We have handle reserved so we are allowed to join T_LOCKED
* transaction and we don't have to check for transaction size
- * and journal space.
+ * and journal space. But we still have to wait while running
+ * transaction is being switched to a committing one as it
+ * won't wait for any handles anymore.
*/
+ if (transaction->t_state == T_SWITCH) {
+ wait_transaction_switching(journal);
+ goto repeat;
+ }
sub_reserved_credits(journal, blocks);
handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
- * use and add the handle to the running transaction.
+ * use and add the handle to the running transaction.
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
handle->h_requested_credits = blocks;
+ handle->h_revoke_credits_requested = handle->h_revoke_credits;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
current->journal_info = handle;
- rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
+ rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
/*
* Ensure that no allocations done while the transaction is open are
@@ -403,15 +457,15 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- handle->h_buffer_credits = nblocks;
+ handle->h_total_credits = nblocks;
handle->h_ref = 1;
return handle;
}
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
- gfp_t gfp_mask, unsigned int type,
- unsigned int line_no)
+ int revoke_records, gfp_t gfp_mask,
+ unsigned int type, unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -425,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
return handle;
}
+ nblocks += DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
@@ -440,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
rsv_handle->h_journal = journal;
handle->h_rsv_handle = rsv_handle;
}
+ handle->h_revoke_credits = revoke_records;
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
@@ -460,7 +517,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
/**
- * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * jbd2_journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffer we might modify
*
@@ -468,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
* modified buffers in the log. We block until the log can guarantee
* that much space. Additionally, if rsv_blocks > 0, we also create another
* handle with rsv_blocks reserved blocks in the journal. This handle is
- * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * stored in h_rsv_handle. It is not attached to any particular transaction
* and thus doesn't block transaction commit. If the caller uses this reserved
* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
* on the parent handle will dispose the reserved one. Reserved handle has to
@@ -480,23 +537,37 @@ EXPORT_SYMBOL(jbd2__journal_start);
*/
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
-void jbd2_journal_free_reserved(handle_t *handle)
+static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
journal_t *journal = handle->h_journal;
WARN_ON(!handle->h_reserved);
- sub_reserved_credits(journal, handle->h_buffer_credits);
+ sub_reserved_credits(journal, handle->h_total_credits);
+ if (t)
+ atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
+}
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ /* Get j_state_lock to pin running transaction if it exists */
+ read_lock(&journal->j_state_lock);
+ __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
+ read_unlock(&journal->j_state_lock);
jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);
/**
- * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * jbd2_journal_start_reserved() - start reserved handle
* @handle: handle to start
+ * @type: for handle statistics
+ * @line_no: for handle statistics
*
* Start handle that has been previously reserved with jbd2_journal_reserve().
* This attaches @handle to the running transaction (or creates one if there's
@@ -533,19 +604,24 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
*/
ret = start_this_handle(journal, handle, GFP_NOFS);
if (ret < 0) {
+ handle->h_journal = journal;
jbd2_journal_free_reserved(handle);
return ret;
}
handle->h_type = type;
handle->h_line_no = line_no;
+ trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+ handle->h_transaction->t_tid, type,
+ line_no, handle->h_total_credits);
return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
- * int jbd2_journal_extend() - extend buffer credits.
+ * jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
@@ -562,7 +638,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
* return code < 0 implies an error
* return code > 0 implies normal transaction-full status.
*/
-int jbd2_journal_extend(handle_t *handle, int nblocks)
+int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -579,53 +655,96 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
/* Don't extend a locked-down transaction! */
if (transaction->t_state != T_RUNNING) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
- spin_lock(&transaction->t_handle_lock);
+ nblocks += DIV_ROUND_UP(
+ handle->h_revoke_credits_requested + revoke_records,
+ journal->j_revoke_records_per_block) -
+ DIV_ROUND_UP(
+ handle->h_revoke_credits_requested,
+ journal->j_revoke_records_per_block);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
- }
-
- if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
- jbd2_log_space_left(journal)) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "insufficient log space\n", handle, nblocks);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
+ goto error_out;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
transaction->t_tid,
handle->h_type, handle->h_line_no,
- handle->h_buffer_credits,
+ handle->h_total_credits,
nblocks);
- handle->h_buffer_credits += nblocks;
+ handle->h_total_credits += nblocks;
handle->h_requested_credits += nblocks;
+ handle->h_revoke_credits += revoke_records;
+ handle->h_revoke_credits_requested += revoke_records;
result = 0;
- jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
- spin_unlock(&transaction->t_handle_lock);
+ jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
error_out:
read_unlock(&journal->j_state_lock);
return result;
}
+static void stop_this_handle(handle_t *handle)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int revokes;
+
+ J_ASSERT(journal_current_handle() == handle);
+ J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+ current->journal_info = NULL;
+ /*
+ * Subtract necessary revoke descriptor blocks from handle credits. We
+ * take care to account only for revoke descriptor blocks the
+ * transaction will really need as large sequences of transactions with
+ * small numbers of revokes are relatively common.
+ */
+ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+ if (revokes) {
+ int t_revokes, revoke_descriptors;
+ int rr_per_blk = journal->j_revoke_records_per_block;
+
+ WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+ > handle->h_total_credits);
+ t_revokes = atomic_add_return(revokes,
+ &transaction->t_outstanding_revokes);
+ revoke_descriptors =
+ DIV_ROUND_UP(t_revokes, rr_per_blk) -
+ DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+ handle->h_total_credits -= revoke_descriptors;
+ }
+ atomic_sub(handle->h_total_credits,
+ &transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle)
+ __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
+ transaction);
+ if (atomic_dec_and_test(&transaction->t_updates))
+ wake_up(&journal->j_wait_updates);
+
+ rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
+}
/**
- * int jbd2_journal_restart() - restart a handle .
+ * jbd2__journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
+ * @revoke_records: number of revoke record credits requested
+ * @gfp_mask: memory allocation flags (for start_this_handle)
*
* Restart a handle for a multi-transaction filesystem
* operation.
@@ -637,56 +756,48 @@ error_out:
* credits. We preserve reserved handle if there's any attached to the
* passed in handle.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+ gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
tid_t tid;
- int need_to_start, ret;
+ int need_to_start;
+ int ret;
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
journal = transaction->t_journal;
+ tid = transaction->t_tid;
/*
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- J_ASSERT(journal_current_handle() == handle);
-
- read_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
- if (handle->h_rsv_handle) {
- sub_reserved_credits(journal,
- handle->h_rsv_handle->h_buffer_credits);
- }
- if (atomic_dec_and_test(&transaction->t_updates))
- wake_up(&journal->j_wait_updates);
- tid = transaction->t_tid;
- spin_unlock(&transaction->t_handle_lock);
+ jbd2_debug(2, "restarting handle %p\n", handle);
+ stop_this_handle(handle);
handle->h_transaction = NULL;
- current->journal_info = NULL;
- jbd_debug(2, "restarting handle %p\n", handle);
+ /*
+ * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+ * get rid of pointless j_state_lock traffic like this.
+ */
+ read_lock(&journal->j_state_lock);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
- handle->h_buffer_credits = nblocks;
- /*
- * Restore the original nofs context because the journal restart
- * is basically the same thing as journal stop and start.
- * start_this_handle will start a new nofs context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ handle->h_total_credits = nblocks +
+ DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
+ handle->h_revoke_credits = revoke_records;
ret = start_this_handle(journal, handle, gfp_mask);
+ trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+ ret ? 0 : handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_total_credits);
return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);
@@ -694,12 +805,49 @@ EXPORT_SYMBOL(jbd2__journal_restart);
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
- return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+ return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
+/*
+ * Waits for any outstanding t_updates to finish.
+ * This is called with write j_state_lock held.
+ */
+void jbd2_journal_wait_updates(journal_t *journal)
+{
+ DEFINE_WAIT(wait);
+
+ while (1) {
+ /*
+ * Note that the running transaction can get freed under us if
+ * this transaction is getting committed in
+ * jbd2_journal_commit_transaction() ->
+ * jbd2_journal_free_transaction(). This can only happen when we
+ * release j_state_lock -> schedule() -> acquire j_state_lock.
+ * Hence we should everytime retrieve new j_running_transaction
+ * value (after j_state_lock release acquire cycle), else it may
+ * lead to use-after-free of old freed transaction.
+ */
+ transaction_t *transaction = journal->j_running_transaction;
+
+ if (!transaction)
+ break;
+
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&transaction->t_updates)) {
+ finish_wait(&journal->j_wait_updates, &wait);
+ break;
+ }
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_updates, &wait);
+ write_lock(&journal->j_state_lock);
+ }
+}
+
/**
- * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
* This locks out any further updates from being started, and blocks
@@ -710,8 +858,6 @@ EXPORT_SYMBOL(jbd2_journal_restart);
*/
void jbd2_journal_lock_updates(journal_t *journal)
{
- DEFINE_WAIT(wait);
-
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
@@ -725,27 +871,9 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
}
- /* Wait until there are no running updates */
- while (1) {
- transaction_t *transaction = journal->j_running_transaction;
-
- if (!transaction)
- break;
+ /* Wait until there are no running t_updates */
+ jbd2_journal_wait_updates(journal);
- spin_lock(&transaction->t_handle_lock);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&transaction->t_updates)) {
- spin_unlock(&transaction->t_handle_lock);
- finish_wait(&journal->j_wait_updates, &wait);
- break;
- }
- spin_unlock(&transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_updates, &wait);
- write_lock(&journal->j_state_lock);
- }
write_unlock(&journal->j_state_lock);
/*
@@ -758,7 +886,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
}
/**
- * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * jbd2_journal_unlock_updates () - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with jbd2_journal_lock_updates().
@@ -773,7 +901,7 @@ void jbd2_journal_unlock_updates (journal_t *journal)
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
@@ -788,19 +916,15 @@ static void warn_dirty_buffer(struct buffer_head *bh)
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
- struct page *page;
- int offset;
char *source;
struct buffer_head *bh = jh2bh(jh);
J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
- page = bh->b_page;
- offset = offset_in_page(bh->b_data);
- source = kmap_atomic(page);
+ source = kmap_local_folio(bh->b_folio, bh_offset(bh));
/* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
- memcpy(jh->b_frozen_data, source + offset, bh->b_size);
- kunmap_atomic(source);
+ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source, bh->b_size);
+ kunmap_local(source);
/*
* Now that the frozen data is saved off, we need to store any matching
@@ -830,11 +954,9 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
char *frozen_buffer = NULL;
unsigned long start_lock, time_lock;
- if (is_handle_aborted(handle))
- return -EROFS;
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+ jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
repeat:
@@ -844,7 +966,7 @@ repeat:
start_lock = jiffies;
lock_buffer(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -865,36 +987,28 @@ repeat:
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -904,14 +1018,16 @@ repeat:
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
* reset the modified flag
*/
- jh->b_modified = 0;
+ jh->b_modified = 0;
/*
* If the buffer is not journaled right now, we need to make sure it
@@ -929,10 +1045,24 @@ repeat:
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
@@ -958,7 +1088,7 @@ repeat:
*/
if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
@@ -979,7 +1109,7 @@ repeat:
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
@@ -998,7 +1128,7 @@ attach_next:
jh->b_next_transaction = transaction;
done:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* If we are about to journal a buffer, then any revoke pending on it is
@@ -1046,8 +1176,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
/* For undo access buffer must have data copied */
if (undo && !jh->b_committed_data)
goto out;
- if (jh->b_transaction != handle->h_transaction &&
- jh->b_next_transaction != handle->h_transaction)
+ if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+ READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
goto out;
/*
* There are two reasons for the barrier here:
@@ -1068,7 +1198,8 @@ out:
}
/**
- * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * jbd2_journal_get_write_access() - notify intent to modify a buffer
+ * for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
@@ -1081,8 +1212,26 @@ out:
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
struct journal_head *jh;
+ journal_t *journal;
int rc;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
+ journal = handle->h_transaction->t_journal;
+ rc = jbd2_check_fs_dev_write_error(journal);
+ if (rc) {
+ /*
+ * If the fs dev has writeback errors, it may have failed
+ * to async write out metadata buffers in the background.
+ * In this case, we could read old data from disk and write
+ * it out again, which may lead to on-disk filesystem
+ * inconsistency. Aborting journal can avoid it happen.
+ */
+ jbd2_journal_abort(journal, rc);
+ return -EIO;
+ }
+
if (jbd2_write_access_granted(handle, bh, false))
return 0;
@@ -1109,7 +1258,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
* unlocked buffer beforehand. */
/**
- * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * jbd2_journal_get_create_access () - notify intent to use newly created bh
* @handle: transaction to new buffer to
* @bh: new buffer.
*
@@ -1122,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
- jbd_debug(5, "journal_head %p\n", jh);
+ jbd2_debug(5, "journal_head %p\n", jh);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
@@ -1136,14 +1285,23 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* committing transaction's lists, but it HAS to be in Forget state in
* that case: the transaction must have deleted the buffer for it to be
* reused here.
+ * In the case of file system data inconsistency, for example, if the
+ * block bitmap of a referenced block is not set, it can lead to the
+ * situation where a block being committed is allocated and used again.
+ * As a result, the following condition will not be satisfied, so here
+ * we directly trigger a JBD abort instead of immediately invoking
+ * bugon.
*/
- jbd_lock_bh_state(bh);
- J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
- jh->b_transaction == NULL ||
- (jh->b_transaction == journal->j_committing_transaction &&
- jh->b_jlist == BJ_Forget)));
+ spin_lock(&jh->b_state_lock);
+ if (!(jh->b_transaction == transaction || jh->b_transaction == NULL ||
+ (jh->b_transaction == journal->j_committing_transaction &&
+ jh->b_jlist == BJ_Forget)) || jh->b_next_transaction != NULL) {
+ err = -EROFS;
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_abort(journal, err);
+ goto out;
+ }
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
if (jh->b_transaction == NULL) {
@@ -1172,7 +1330,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
@@ -1189,7 +1347,7 @@ out:
}
/**
- * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ * jbd2_journal_get_undo_access() - Notify intent to modify metadata with
* non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
@@ -1220,11 +1378,15 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
- JBUFFER_TRACE(jh, "entry");
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1239,13 +1401,13 @@ repeat:
committed_data = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto repeat;
}
@@ -1253,7 +1415,7 @@ repeat:
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
@@ -1262,7 +1424,7 @@ out:
}
/**
- * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * jbd2_journal_set_triggers() - Add triggers for commit writeout
* @bh: buffer to trigger on
* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
*
@@ -1277,7 +1439,7 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
{
struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
- if (WARN_ON(!jh))
+ if (WARN_ON_ONCE(!jh))
return;
jh->b_triggers = type;
jbd2_journal_put_journal_head(jh);
@@ -1304,7 +1466,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
}
/**
- * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
@@ -1333,47 +1495,64 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int ret = 0;
- if (is_handle_aborted(handle))
- return -EROFS;
- if (!buffer_jbd(bh)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (!buffer_jbd(bh))
+ return -EUCLEAN;
+
/*
* We don't grab jh reference here since the buffer must be part
* of the running transaction.
*/
jh = bh2jh(bh);
+ jbd2_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* This and the following assertions are unreliable since we may see jh
* in inconsistent state unless we grab bh_state lock. But this is
* crucial to catch bugs so let's do a reliable check until the
* lockless handling is fully proven.
*/
- if (jh->b_transaction != transaction &&
- jh->b_next_transaction != transaction) {
- jbd_lock_bh_state(bh);
+ if (data_race(jh->b_transaction != transaction &&
+ jh->b_next_transaction != transaction)) {
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
- if (jh->b_modified == 1) {
+ if (data_race(jh->b_modified == 1)) {
/* If it's in our transaction it must be in BJ_Metadata list. */
- if (jh->b_transaction == transaction &&
- jh->b_jlist != BJ_Metadata) {
- jbd_lock_bh_state(bh);
+ if (data_race(jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata)) {
+ spin_lock(&jh->b_state_lock);
+ if (jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata)
+ pr_err("JBD2: assertion failure: h_type=%u "
+ "h_line_no=%u block_no=%llu jlist=%u\n",
+ handle->h_type, handle->h_line_no,
+ (unsigned long long) bh->b_blocknr,
+ jh->b_jlist);
J_ASSERT_JH(jh, jh->b_transaction != transaction ||
jh->b_jlist == BJ_Metadata);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
goto out;
}
- journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
+ spin_lock(&jh->b_state_lock);
+
+ if (is_handle_aborted(handle)) {
+ /*
+ * Check journal aborting with @jh->b_state_lock locked,
+ * since 'jh->b_transaction' could be replaced with
+ * 'jh->b_next_transaction' during old transaction
+ * committing if journal aborted, which may fail
+ * assertion on 'jh->b_frozen_data == NULL'.
+ */
+ ret = -EROFS;
+ goto out_unlock_bh;
+ }
- jbd_lock_bh_state(bh);
+ journal = transaction->t_journal;
if (jh->b_modified == 0) {
/*
@@ -1381,12 +1560,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* of the transaction. This needs to be done
* once a transaction -bzzz
*/
- jh->b_modified = 1;
- if (handle->h_buffer_credits <= 0) {
+ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
ret = -ENOSPC;
goto out_unlock_bh;
}
- handle->h_buffer_credits--;
+ jh->b_modified = 1;
+ handle->h_total_credits--;
}
/*
@@ -1459,14 +1638,14 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
}
/**
- * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
* @bh: bh to 'forget'
*
@@ -1482,7 +1661,7 @@ out:
* Allow this call even if the handle has aborted --- it may be part of
* the caller's cleanup after an abort.
*/
-int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -1490,6 +1669,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ int wait_for_writeback = 0;
if (is_handle_aborted(handle))
return -EROFS;
@@ -1497,18 +1677,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
BUFFER_TRACE(bh, "entry");
- jbd_lock_bh_state(bh);
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
+ __bforget(bh);
+ return 0;
+ }
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
+ spin_lock(&jh->b_state_lock);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
- goto not_jbd;
+ goto drop;
}
/* keep track of whether or not this transaction modified us */
@@ -1556,12 +1738,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
- }
+ jbd2_journal_put_journal_head(jh);
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
@@ -1570,14 +1747,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
/* However, if the buffer is still owned by a prior
* (committing) transaction, we can't drop it yet... */
JBUFFER_TRACE(jh, "belongs to older transaction");
- /* ... but we CAN drop it from the new transaction if we
- * have also modified it since the original commit. */
+ /* ... but we CAN drop it from the new transaction through
+ * marking the buffer as freed and set j_next_transaction to
+ * the new transaction, so that not only the commit code
+ * knows it should clear dirty bits when it is done with the
+ * buffer, but also the buffer can be checkpointed only
+ * after the new transaction commits. */
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction == transaction);
+ set_buffer_freed(bh);
+
+ if (!jh->b_next_transaction) {
spin_lock(&journal->j_list_lock);
- jh->b_next_transaction = NULL;
+ jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
+ } else {
+ J_ASSERT(jh->b_next_transaction == transaction);
/*
* only drop a reference if this transaction modified
@@ -1586,21 +1770,55 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (was_modified)
drop_reserve = 1;
}
- }
+ } else {
+ /*
+ * Finally, if the buffer is not belongs to any
+ * transaction, we can just drop it now if it has no
+ * checkpoint.
+ */
+ spin_lock(&journal->j_list_lock);
+ if (!jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "belongs to none transaction");
+ spin_unlock(&journal->j_list_lock);
+ goto drop;
+ }
-not_jbd:
- jbd_unlock_bh_state(bh);
- __brelse(bh);
+ /*
+ * Otherwise, if the buffer has been written to disk,
+ * it is safe to remove the checkpoint and drop it.
+ */
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
+ spin_unlock(&journal->j_list_lock);
+ goto drop;
+ }
+
+ /*
+ * The buffer has not yet been written to disk. We should
+ * either clear the buffer or ensure that the ongoing I/O
+ * is completed, and attach this buffer to current
+ * transaction so that the buffer can be checkpointed only
+ * after the current transaction commits.
+ */
+ clear_buffer_dirty(bh);
+ wait_for_writeback = 1;
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ spin_unlock(&journal->j_list_lock);
+ }
drop:
+ __brelse(bh);
+ spin_unlock(&jh->b_state_lock);
+ if (wait_for_writeback)
+ wait_on_buffer(bh);
+ jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
- handle->h_buffer_credits++;
+ handle->h_total_credits++;
}
return err;
}
/**
- * int jbd2_journal_stop() - complete a transaction
+ * jbd2_journal_stop() - complete a transaction
* @handle: transaction to complete.
*
* All done for a particular handle.
@@ -1623,45 +1841,34 @@ int jbd2_journal_stop(handle_t *handle)
tid_t tid;
pid_t pid;
+ if (--handle->h_ref > 0) {
+ jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ if (is_handle_aborted(handle))
+ return -EIO;
+ return 0;
+ }
if (!transaction) {
/*
- * Handle is already detached from the transaction so
- * there is nothing to do other than decrease a refcount,
- * or free the handle if refcount drops to zero
+ * Handle is already detached from the transaction so there is
+ * nothing to do other than free the handle.
*/
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- } else {
- if (handle->h_rsv_handle)
- jbd2_free_handle(handle->h_rsv_handle);
- goto free_and_exit;
- }
+ memalloc_nofs_restore(handle->saved_alloc_context);
+ goto free_and_exit;
}
journal = transaction->t_journal;
-
- J_ASSERT(journal_current_handle() == handle);
+ tid = transaction->t_tid;
if (is_handle_aborted(handle))
err = -EIO;
- else
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- }
- jbd_debug(4, "Handle %p going down\n", handle);
+ jbd2_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- transaction->t_tid,
- handle->h_type, handle->h_line_no,
+ tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
(handle->h_requested_credits -
- handle->h_buffer_credits));
+ handle->h_total_credits));
/*
* Implement synchronous transaction batching. If the handle
@@ -1721,28 +1928,22 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
- current->journal_info = NULL;
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
/*
* If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
- * transaction is too old now.
+ * going! We also want to force a commit if the transaction is too
+ * old now.
*/
if (handle->h_sync ||
- (atomic_read(&transaction->t_outstanding_credits) >
- journal->j_max_transaction_buffers) ||
time_after_eq(jiffies, transaction->t_expires)) {
/* Do this even for aborted journals: an abort still
* completes the commit thread, it just doesn't write
* anything to disk. */
- jbd_debug(2, "transaction too old, requesting commit for "
+ jbd2_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- jbd2_log_start_commit(journal, transaction->t_tid);
+ jbd2_log_start_commit(journal, tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1753,31 +1954,19 @@ int jbd2_journal_stop(handle_t *handle)
}
/*
- * Once we drop t_updates, if it goes to zero the transaction
- * could start committing on us and eventually disappear. So
- * once we do this, we must not dereference transaction
- * pointer again.
+ * Once stop_this_handle() drops t_updates, the transaction could start
+ * committing on us and eventually disappear. So we must not
+ * dereference transaction pointer again after calling
+ * stop_this_handle().
*/
- tid = transaction->t_tid;
- if (atomic_dec_and_test(&transaction->t_updates)) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ stop_this_handle(handle);
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
- if (handle->h_rsv_handle)
- jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
- /*
- * Scope of the GFP_NOFS context is over here and so we can restore the
- * original alloc context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
return err;
}
@@ -1795,7 +1984,7 @@ free_and_exit:
*
* j_list_lock is held.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1819,7 +2008,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
*
* Called with j_list_lock held, and the journal may not be locked.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1851,7 +2040,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1888,70 +2077,24 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
}
/*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
*
* Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
- jbd2_journal_put_journal_head(jh);
-}
-
-void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- __jbd2_journal_unfile_buffer(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __brelse(bh);
-}
-
-/*
- * Called from jbd2_journal_try_to_free_buffers().
- *
- * Called under jbd_lock_bh_state(bh)
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
- struct journal_head *jh;
-
- jh = bh2jh(bh);
-
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
- if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
-
- spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
- spin_unlock(&journal->j_list_lock);
-out:
- return;
}
/**
- * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
- * @page: to try and free
- * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
- * code to release the buffers.
- *
+ * @folio: Folio to detach data from.
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
@@ -1980,18 +2123,17 @@ out:
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
*
- * Return 0 on failure, 1 on success
+ * Return false on failure, true on success
*/
-int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t gfp_mask)
+bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
{
struct buffer_head *head;
struct buffer_head *bh;
- int ret = 0;
+ bool ret = false;
- J_ASSERT(PageLocked(page));
+ J_ASSERT(folio_test_locked(folio));
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
struct journal_head *jh;
@@ -2005,16 +2147,21 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (!jh)
continue;
- jbd_lock_bh_state(bh);
- __journal_try_to_free_buffer(journal, bh);
+ spin_lock(&jh->b_state_lock);
+ if (!jh->b_transaction && !jh->b_next_transaction) {
+ spin_lock(&journal->j_list_lock);
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ }
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
- ret = try_to_free_buffers(page);
-
+ ret = try_to_free_buffers(folio);
busy:
return ret;
}
@@ -2029,7 +2176,7 @@ busy:
*
* Called under j_list_lock.
*
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
@@ -2042,7 +2189,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
/*
* We don't want to write the buffer anymore, clear the
* bit so that we don't confuse checks in
- * __journal_file_buffer
+ * __jbd2_journal_file_buffer
*/
clear_buffer_dirty(bh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -2050,19 +2197,20 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
} else {
JBUFFER_TRACE(jh, "on running transaction");
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
}
return may_free;
}
/*
- * jbd2_journal_invalidatepage
+ * jbd2_journal_invalidate_folio
*
* This code is tricky. It has a number of cases to deal with.
*
* There are two invariants which this code relies on:
*
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
+ * i_size must be updated on disk before we start calling invalidate_folio
+ * on the data.
*
* This is done in ext3 by defining an ext3_setattr method which
* updates i_size before truncate gets going. By maintaining this
@@ -2116,18 +2264,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* holding the page lock. --sct
*/
- if (!buffer_jbd(bh))
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
goto zap_buffer_unlocked;
/* OK, we have data buffer in journaled mode */
write_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
/*
* We cannot remove the buffer from checkpoint lists until the
* transaction adding inode to orphan list (let's call it T)
@@ -2206,25 +2351,30 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* for commit and try again.
*/
if (partial_page) {
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
+ /* Already zapped buffer? Nothing to do... */
+ if (!bh->b_bdev)
+ return 0;
return -EBUSY;
}
/*
- * OK, buffer won't be reachable after truncate. We just set
- * j_next_transaction to the running transaction (if there is
- * one) and mark buffer as freed so that commit code knows it
- * should clear dirty bits when it is done with the buffer.
+ * OK, buffer won't be reachable after truncate. We just clear
+ * b_modified to not confuse transaction credit accounting, and
+ * set j_next_transaction to the running transaction (if there
+ * is one) and mark buffer as freed so that commit code knows
+ * it should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
- jbd2_journal_put_journal_head(jh);
+ jh->b_modified = 0;
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
@@ -2248,11 +2398,10 @@ zap_buffer:
* here.
*/
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2266,9 +2415,9 @@ zap_buffer_unlocked:
}
/**
- * void jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidate_folio()
* @journal: journal to use for flush...
- * @page: page to flush
+ * @folio: folio to flush
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
@@ -2277,30 +2426,29 @@ zap_buffer_unlocked:
* the page is straddling i_size. Caller then has to wait for current commit
* and try again.
*/
-int jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned int offset,
- unsigned int length)
+int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
+ size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int may_free = 1;
int ret = 0;
- if (!PageLocked(page))
+ if (!folio_test_locked(folio))
BUG();
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return 0;
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
- head = bh = page_buffers(page);
+ bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
@@ -2323,8 +2471,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(page))
- J_ASSERT(!page_has_buffers(page));
+ if (may_free && try_to_free_buffers(folio))
+ J_ASSERT(!folio_buffers(folio));
}
return 0;
}
@@ -2339,7 +2487,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2401,11 +2549,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
void jbd2_journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
- jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&jh->b_state_lock);
spin_lock(&transaction->t_journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ spin_unlock(&jh->b_state_lock);
}
/*
@@ -2415,23 +2563,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
* buffer on that transaction's metadata list.
*
* Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
*
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop jh reference through
+ * jbd2_journal_put_journal_head().
*/
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__jbd2_journal_unfile_buffer(jh);
- return;
+ return true;
}
/*
@@ -2441,13 +2591,20 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
was_dirty = test_clear_buffer_jbddirty(bh);
__jbd2_journal_temp_unlink_buffer(jh);
+
+ /*
+ * b_transaction must be set, otherwise the new b_transaction won't
+ * be holding jh reference
+ */
+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
+
/*
* We set b_transaction here because b_next_transaction will inherit
* our jh reference and thus __jbd2_journal_file_buffer() must not
* take a new one.
*/
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
+ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
+ WRITE_ONCE(jh->b_next_transaction, NULL);
if (buffer_freed(bh))
jlist = BJ_Forget;
else if (jh->b_modified)
@@ -2459,6 +2616,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
if (was_dirty)
set_buffer_jbddirty(bh);
+ return false;
}
/*
@@ -2469,23 +2627,22 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
*/
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
- struct buffer_head *bh = jh2bh(jh);
+ bool drop;
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
spin_unlock(&journal->j_list_lock);
- __brelse(bh);
+ if (drop)
+ jbd2_journal_put_journal_head(jh);
}
/*
* File inode in the inode list of the handle's transaction
*/
static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
- unsigned long flags)
+ unsigned long flags, loff_t start_byte, loff_t end_byte)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -2494,29 +2651,20 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
return -EROFS;
journal = transaction->t_journal;
- jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
- /*
- * First check whether inode isn't already on the transaction's
- * lists without taking the lock. Note that this check is safe
- * without the lock as we cannot race with somebody removing inode
- * from the transaction. The reason is that we remove inode from the
- * transaction only in journal_release_jbd_inode() and when we commit
- * the transaction. We are guarded from the first case by holding
- * a reference to the inode. We are safe against the second case
- * because if jinode->i_transaction == transaction, commit code
- * cannot touch the transaction because we hold reference to it,
- * and if jinode->i_next_transaction == transaction, commit code
- * will only file the inode where we want it.
- */
- if ((jinode->i_transaction == transaction ||
- jinode->i_next_transaction == transaction) &&
- (jinode->i_flags & flags) == flags)
- return 0;
-
spin_lock(&journal->j_list_lock);
jinode->i_flags |= flags;
+
+ if (jinode->i_dirty_end) {
+ jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
+ jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
+ } else {
+ jinode->i_dirty_start = start_byte;
+ jinode->i_dirty_end = end_byte;
+ }
+
/* Is inode already attached where we need it? */
if (jinode->i_transaction == transaction ||
jinode->i_next_transaction == transaction)
@@ -2548,15 +2696,19 @@ done:
return 0;
}
-int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+int jbd2_journal_inode_ranged_write(handle_t *handle,
+ struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
return jbd2_journal_file_inode(handle, jinode,
- JI_WRITE_DATA | JI_WAIT_DATA);
+ JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
+ start_byte + length - 1);
}
-int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
+ loff_t start_byte, loff_t length)
{
- return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+ return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
+ start_byte, start_byte + length - 1);
}
/*