Diffstat (limited to 'fs/jbd2/commit.c')
-rw-r--r--  fs/jbd2/commit.c  520
1 file changed, 259 insertions(+), 261 deletions(-)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 559bec1a37b4..7203d2d2624d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * linux/fs/jbd2/commit.c
  *
@@ -5,10 +6,6 @@
  *
  * Copyright 1998 Red Hat corp --- All Rights Reserved
  *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
  * Journal commit routines for the generic filesystem journaling code;
  * part of the ext2fs journaling system.
  */
@@ -43,7 +40,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 		clear_buffer_uptodate(bh);
 	if (orig_bh) {
 		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		wake_up_bit(&orig_bh->b_state, BH_Shadow);
 	}
 	unlock_buffer(bh);
@@ -60,32 +57,30 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  * So here, we have a buffer which has just come off the forget list.  Look to
  * see if we can strip all buffers from the backing page.
  *
- * Called under lock_journal(), and possibly under journal_datalist_lock.  The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under j_list_lock.  The caller provided us with a ref against the
+ * buffer, and we drop that here.
  */
 static void release_buffer_page(struct buffer_head *bh)
 {
-	struct page *page;
+	struct folio *folio;
 
 	if (buffer_dirty(bh))
 		goto nope;
 	if (atomic_read(&bh->b_count) != 1)
 		goto nope;
-	page = bh->b_page;
-	if (!page)
-		goto nope;
-	if (page->mapping)
+	folio = bh->b_folio;
+	if (folio->mapping)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (!trylock_page(page))
+	if (!folio_trylock(folio))
 		goto nope;
 
-	page_cache_get(page);
+	folio_get(folio);
 	__brelse(bh);
-	try_to_free_buffers(page);
-	unlock_page(page);
-	page_cache_release(page);
+	try_to_free_buffers(folio);
+	folio_unlock(folio);
+	folio_put(folio);
 	return;
 
 nope:
@@ -97,14 +92,14 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
 	struct commit_header *h;
 	__u32 csum;
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return;
 
 	h = (struct commit_header *)(bh->b_data);
 	h->h_chksum_type = 0;
 	h->h_chksum_size = 0;
 	h->h_chksum[0] = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+	csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
 	h->h_chksum[0] = cpu_to_be32(csum);
 }
 
@@ -123,27 +118,25 @@ static int journal_submit_commit_record(journal_t *journal,
 {
 	struct commit_header *tmp;
 	struct buffer_head *bh;
-	int ret;
-	struct timespec now = current_kernel_time();
+	struct timespec64 now;
+	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
 
 	*cbh = NULL;
 
 	if (is_journal_aborted(journal))
 		return 0;
 
-	bh = jbd2_journal_get_descriptor_buffer(journal);
+	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
+						JBD2_COMMIT_BLOCK);
 	if (!bh)
 		return 1;
 
 	tmp = (struct commit_header *)bh->b_data;
-	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
-	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
-	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	ktime_get_coarse_real_ts64(&now);
 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 
-	if (JBD2_HAS_COMPAT_FEATURE(journal,
-				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+	if (jbd2_has_feature_checksum(journal)) {
 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
@@ -157,14 +150,12 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
-	else
-		ret = submit_bh(WRITE_SYNC, bh);
+	    !jbd2_has_feature_async_commit(journal))
+		write_flags |= REQ_PREFLUSH | REQ_FUA;
 
+	submit_bh(write_flags, bh);
 	*cbh = bh;
-	return ret;
+	return 0;
 }
 
 /*
@@ -186,25 +177,28 @@ static int journal_wait_on_commit_record(journal_t *journal,
 	return ret;
 }
 
-/*
- * write the filemap data using writepage() address_space_operations.
- * We don't do block allocation here even for delalloc. We don't
- * use writepages() because with dealyed allocation we may be doing
- * block allocation in writepages().
- */
-static int journal_submit_inode_data_buffers(struct address_space *mapping)
+/* Send all the data buffers related to an inode */
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
 {
-	int ret;
-	struct writeback_control wbc = {
-		.sync_mode =  WB_SYNC_ALL,
-		.nr_to_write = mapping->nrpages * 2,
-		.range_start = 0,
-		.range_end = i_size_read(mapping->host),
-	};
-
-	ret = generic_writepages(mapping, &wbc);
-	return ret;
+	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
+		return 0;
+
+	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+	return journal->j_submit_inode_data_buffers(jinode);
+
+}
+EXPORT_SYMBOL(jbd2_submit_inode_data);
+
+int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
+	    !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
+		return 0;
+	return filemap_fdatawait_range_keep_errors(
+		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
+		jinode->i_dirty_end);
 }
+EXPORT_SYMBOL(jbd2_wait_inode_data);
 
 /*
  * Submit all the data buffers of inode associated with the transaction to
@@ -219,33 +213,39 @@ static int journal_submit_data_buffers(journal_t *journal,
 {
 	struct jbd2_inode *jinode;
 	int err, ret = 0;
-	struct address_space *mapping;
 
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-		mapping = jinode->i_vfs_inode->i_mapping;
-		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		if (!(jinode->i_flags & JI_WRITE_DATA))
+			continue;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
-		/*
-		 * submit the inode data buffers. We use writepage
-		 * instead of writepages. Because writepages can do
-		 * block allocation with delalloc. We need to write
-		 * only allocated blocks here.
-		 */
+		/* submit the inode data buffers. */
 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
-		err = journal_submit_inode_data_buffers(mapping);
-		if (!ret)
-			ret = err;
+		if (journal->j_submit_inode_data_buffers) {
+			err = journal->j_submit_inode_data_buffers(jinode);
+			if (!ret)
+				ret = err;
+		}
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
-		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
-		smp_mb__after_clear_bit();
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		smp_mb();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 	spin_unlock(&journal->j_list_lock);
 	return ret;
 }
 
+int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+
+	return filemap_fdatawait_range_keep_errors(mapping,
+						   jinode->i_dirty_start,
+						   jinode->i_dirty_end);
+}
+
 /*
  * Wait for data submitted for writeout, refile inodes to proper
  * transaction if needed.
@@ -260,24 +260,20 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		if (!(jinode->i_flags & JI_WAIT_DATA))
+			continue;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
-		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
-		if (err) {
-			/*
-			 * Because AS_EIO is cleared by
-			 * filemap_fdatawait_range(), set it again so
-			 * that user process can get -EIO from fsync().
-			 */
-			set_bit(AS_EIO,
-				&jinode->i_vfs_inode->i_mapping->flags);
-
+		/* wait for the inode data buffers writeout. */
+		if (journal->j_finish_inode_data_buffers) {
+			err = journal->j_finish_inode_data_buffers(jinode);
 			if (!ret)
 				ret = err;
 		}
+		cond_resched();
 		spin_lock(&journal->j_list_lock);
-		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
-		smp_mb__after_clear_bit();
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		smp_mb();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 
@@ -292,6 +288,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 				&jinode->i_transaction->t_inode_list);
 		} else {
 			jinode->i_transaction = NULL;
+			jinode->i_dirty_start = 0;
+			jinode->i_dirty_end = 0;
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -301,62 +299,45 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 {
-	struct page *page = bh->b_page;
 	char *addr;
 	__u32 checksum;
 
-	addr = kmap_atomic(page);
-	checksum = crc32_be(crc32_sum,
-		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
-	kunmap_atomic(addr);
+	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+	checksum = crc32_be(crc32_sum, addr, bh->b_size);
+	kunmap_local(addr);
 
 	return checksum;
 }
 
-static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
 				   unsigned long long block)
 {
 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
-	if (tag_bytes > JBD2_TAG_SIZE32)
+	if (jbd2_has_feature_64bit(j))
 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 }
 
-static void jbd2_descr_block_csum_set(journal_t *j,
-				      struct buffer_head *bh)
-{
-	struct jbd2_journal_block_tail *tail;
-	__u32 csum;
-
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
-		return;
-
-	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
-			sizeof(struct jbd2_journal_block_tail));
-	tail->t_checksum = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
-	tail->t_checksum = cpu_to_be32(csum);
-}
-
 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 				    struct buffer_head *bh, __u32 sequence)
 {
-	struct page *page = bh->b_page;
+	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
 	__u8 *addr;
 	__u32 csum32;
+	__be32 seq;
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (!jbd2_journal_has_csum_v2or3(j))
 		return;
 
-	sequence = cpu_to_be32(sequence);
-	addr = kmap_atomic(page);
-	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
-			     sizeof(sequence));
-	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
-			     bh->b_size);
-	kunmap_atomic(addr);
+	seq = cpu_to_be32(sequence);
+	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
+	kunmap_local(addr);
 
-	/* We only have space to store the lower 16 bits of the crc32c. */
-	tag->t_checksum = cpu_to_be16(csum32);
+	if (jbd2_has_feature_csum3(j))
+		tag3->t_checksum = cpu_to_be32(csum32);
+	else
+		tag->t_checksum = cpu_to_be16(csum32);
 }
 
 /*
  * jbd2_journal_commit_transaction
@@ -372,13 +353,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	struct buffer_head *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
 	int bufs;
-	int flags;
+	int escape;
 	int err;
 	unsigned long long blocknr;
 	ktime_t start_time;
 	u64 commit_time;
 	char *tagp = NULL;
-	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
 	int space_left = 0;
 	int first_tag = 0;
@@ -396,7 +376,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	LIST_HEAD(io_bufs);
 	LIST_HEAD(log_bufs);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (jbd2_journal_has_csum_v2or3(journal))
 		csum_size = sizeof(struct jbd2_journal_block_tail);
 
 	/*
@@ -406,8 +386,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
-		jbd_debug(3, "super block updated\n");
-		mutex_lock(&journal->j_checkpoint_mutex);
+		jbd2_debug(3, "super block updated\n");
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * We hold j_checkpoint_mutex so tail cannot change under us.
 		 * We don't need any special data guarantees for writing sb
@@ -416,23 +396,46 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 */
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
-						journal->j_tail,
-						WRITE_SYNC);
+						journal->j_tail, 0);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
-		jbd_debug(3, "superblock not updated\n");
+		jbd2_debug(3, "superblock not updated\n");
 	}
 
 	J_ASSERT(journal->j_running_transaction != NULL);
 	J_ASSERT(journal->j_committing_transaction == NULL);
 
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_fc_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		write_unlock(&journal->j_state_lock);
+		schedule();
+		write_lock(&journal->j_state_lock);
+		finish_wait(&journal->j_fc_wait, &wait);
+		/*
+		 * TODO: by blocking fast commits here, we are increasing
+		 * fsync() latency slightly. Strictly speaking, we don't need
+		 * to block fast commits until the transaction enters T_FLUSH
+		 * state. So an optimization is possible where we block new fast
+		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
+		 * commits and this full commit can proceed parallely.
+		 */
+	}
+	write_unlock(&journal->j_state_lock);
+
 	commit_transaction = journal->j_running_transaction;
 
 	trace_jbd2_start_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
+	journal->j_fc_off = 0;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 	commit_transaction->t_state = T_LOCKED;
 
@@ -447,22 +450,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 					      stats.run.rs_locked);
 
-	spin_lock(&commit_transaction->t_handle_lock);
-	while (atomic_read(&commit_transaction->t_updates)) {
-		DEFINE_WAIT(wait);
+	// waits for any t_updates to finish
+	jbd2_journal_wait_updates(journal);
 
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-					TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&commit_transaction->t_updates)) {
-			spin_unlock(&commit_transaction->t_handle_lock);
-			write_unlock(&journal->j_state_lock);
-			schedule();
-			write_lock(&journal->j_state_lock);
-			spin_lock(&commit_transaction->t_handle_lock);
-		}
-		finish_wait(&journal->j_wait_updates, &wait);
-	}
-	spin_unlock(&commit_transaction->t_handle_lock);
+	commit_transaction->t_state = T_SWITCH;
 
 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 			journal->j_max_transaction_buffers);
@@ -482,6 +473,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * has reserved.  This is consistent with the existing behaviour
 	 * that multiple jbd2_journal_get_write_access() calls to the same
 	 * buffer are perfectly permissible.
+	 * We use journal->j_state_lock here to serialize processing of
+	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
 	 */
 	while (commit_transaction->t_reserved_list) {
 		jh = commit_transaction->t_reserved_list;
@@ -493,24 +486,25 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (jh->b_committed_data) {
 			struct buffer_head *bh = jh2bh(jh);
 
-			jbd_lock_bh_state(bh);
+			spin_lock(&jh->b_state_lock);
 			jbd2_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
-			jbd_unlock_bh_state(bh);
+			spin_unlock(&jh->b_state_lock);
 		}
 		jbd2_journal_refile_buffer(journal, jh);
 	}
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * Now try to drop any written-back buffers from the journal's
 	 * checkpoint lists.  We do this *before* commit because it potentially
 	 * frees some memory
 	 */
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_clean_checkpoint_list(journal);
+	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
 	spin_unlock(&journal->j_list_lock);
 
-	jbd_debug(3, "JBD2: commit phase 1\n");
+	jbd2_debug(3, "JBD2: commit phase 1\n");
 
 	/*
 	 * Clear revoked flag to reflect there is no revoked buffers
@@ -523,6 +517,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	write_lock(&journal->j_state_lock);
 	/*
 	 * Reserved credits cannot be claimed anymore, free them
 	 */
@@ -539,10 +534,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	journal->j_running_transaction = NULL;
 	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
-	wake_up(&journal->j_wait_transaction_locked);
+	wake_up_all(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2a\n");
+	jbd2_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -553,11 +548,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jbd2_journal_abort(journal, err);
 
 	blk_start_plug(&plug);
-	jbd2_journal_write_revoke_records(journal, commit_transaction,
-					  &log_bufs, WRITE_SYNC);
-	blk_finish_plug(&plug);
+	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 
-	jbd_debug(3, "JBD2: commit phase 2b\n");
+	jbd2_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -572,17 +565,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	stats.run.rs_logging = jiffies;
 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 					       stats.run.rs_logging);
-	stats.run.rs_blocks =
-		atomic_read(&commit_transaction->t_outstanding_credits);
+	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
 	stats.run.rs_blocks_logged = 0;
 
 	J_ASSERT(commit_transaction->t_nr_buffers <=
 		 atomic_read(&commit_transaction->t_outstanding_credits));
 
-	err = 0;
 	bufs = 0;
 	descriptor = NULL;
-	blk_start_plug(&plug);
 	while (commit_transaction->t_buffers) {
 
 		/* Find the next buffer to be journaled... */
@@ -615,22 +605,19 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (!descriptor) {
 			J_ASSERT (bufs == 0);
 
-			jbd_debug(4, "JBD2: get descriptor\n");
+			jbd2_debug(4, "JBD2: get descriptor\n");
 
-			descriptor = jbd2_journal_get_descriptor_buffer(journal);
+			descriptor = jbd2_journal_get_descriptor_buffer(
+							commit_transaction,
+							JBD2_DESCRIPTOR_BLOCK);
 			if (!descriptor) {
 				jbd2_journal_abort(journal, -EIO);
 				continue;
 			}
 
-			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
 				(unsigned long long)descriptor->b_blocknr,
 				descriptor->b_data);
-			header = (journal_header_t *)descriptor->b_data;
-			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
-			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
-			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
-
 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
 			space_left = descriptor->b_size -
 						sizeof(journal_header_t);
@@ -658,8 +645,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 		/*
 		 * start_this_handle() uses t_outstanding_credits to determine
-		 * the free space in the log, but this counter is changed
-		 * by jbd2_journal_next_log_block() also.
+		 * the free space in the log.
 		 */
 		atomic_dec(&commit_transaction->t_outstanding_credits);
 
@@ -674,25 +660,21 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 */
 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		JBUFFER_TRACE(jh, "ph3: write metadata");
-		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
 						jh, &wbuf[bufs], blocknr);
-		if (flags < 0) {
-			jbd2_journal_abort(journal, flags);
-			continue;
-		}
 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
 
 		/* Record the new block's tag in the current descriptor
 		   buffer */
 		tag_flag = 0;
-		if (flags & 1)
+		if (escape)
 			tag_flag |= JBD2_FLAG_ESCAPE;
 		if (!first_tag)
 			tag_flag |= JBD2_FLAG_SAME_UUID;
 
 		tag = (journal_block_tag_t *) tagp;
-		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
 		tag->t_flags = cpu_to_be16(tag_flag);
 		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 					commit_transaction->t_tid);
@@ -714,23 +696,25 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		    commit_transaction->t_buffers == NULL ||
 		    space_left < tag_bytes + 16 + csum_size) {
 
-			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
 			/* Write an end-of-descriptor marker before
 			   submitting the IOs.  "tag" still points to
 			   the last tag we set up. */
 
 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
-			jbd2_descr_block_csum_set(journal, descriptor);
 start_journal_io:
+			if (descriptor)
+				jbd2_descriptor_block_csum_set(journal,
+							descriptor);
+
 			for (i = 0; i < bufs; i++) {
 				struct buffer_head *bh = wbuf[i];
+
 				/*
 				 * Compute checksum.
 				 */
-				if (JBD2_HAS_COMPAT_FEATURE(journal,
-					JBD2_FEATURE_COMPAT_CHECKSUM)) {
+				if (jbd2_has_feature_checksum(journal)) {
 					crc32_sum =
 					    jbd2_checksum_data(crc32_sum, bh);
 				}
@@ -739,10 +723,10 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE_SYNC, bh);
+				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+					  bh);
 			}
 			cond_resched();
-			stats.run.rs_blocks_logged += bufs;
 
 			/* Force a new descriptor to be generated next
 			   time round the loop. */
@@ -754,10 +738,8 @@ start_journal_io:
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	if (err) {
 		printk(KERN_WARNING
-			"JBD2: Detected IO errors while flushing file data "
-		       "on %s\n", journal->j_devname);
-		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-			jbd2_journal_abort(journal, err);
+		       "JBD2: Detected IO errors %d while flushing file data on %s\n",
+		       err, journal->j_devname);
 		err = 0;
 	}
 
@@ -778,30 +760,29 @@ start_journal_io:
 		if (first_block < journal->j_tail)
 			freed += journal->j_last - journal->j_first;
 		/* Update tail only if we free significant amount of space */
-		if (freed < journal->j_maxlen / 4)
+		if (freed < journal->j_max_transaction_buffers)
 			update_tail = 0;
 	}
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 	commit_transaction->t_state = T_COMMIT_DFLUSH;
 	write_unlock(&journal->j_state_lock);
 
-	/* 
+	/*
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
-	 * the commit record
+	 * the commit record and update the journal tail sequence.
 	 */
-	if (commit_transaction->t_need_data_flush &&
+	if ((commit_transaction->t_need_data_flush || update_tail) &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+		blkdev_issue_flush(journal->j_fs_dev);
 
 	/* Done it all: now write the commit record asynchronously. */
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
-			__jbd2_journal_abort_hard(journal);
+			jbd2_journal_abort(journal, err);
 	}
 
 	blk_finish_plug(&plug);
@@ -817,7 +798,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD2: commit phase 3\n");
+	jbd2_debug(3, "JBD2: commit phase 3\n");
 
 	while (!list_empty(&io_bufs)) {
 		struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -830,6 +811,7 @@ start_journal_io:
 		if (unlikely(!buffer_uptodate(bh)))
 			err = -EIO;
 		jbd2_unfile_log_bh(bh);
+		stats.run.rs_blocks_logged++;
 
 		/*
 		 * The list contains temporary buffer heads created by
@@ -859,7 +841,7 @@ start_journal_io:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD2: commit phase 4\n");
+	jbd2_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
 	while (!list_empty(&log_bufs)) {
@@ -875,6 +857,7 @@ start_journal_io:
 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 		clear_buffer_jwrite(bh);
 		jbd2_unfile_log_bh(bh);
+		stats.run.rs_blocks_logged++;
 		__brelse(bh);		/* One for getblk */
 		/* AKPM: bforget here */
 	}
@@ -882,30 +865,32 @@ start_journal_io:
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd_debug(3, "JBD2: commit phase 5\n");
+	jbd2_debug(3, "JBD2: commit phase 5\n");
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
 	write_unlock(&journal->j_state_lock);
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (!jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
-			__jbd2_journal_abort_hard(journal);
+			jbd2_journal_abort(journal, err);
 	}
 	if (cbh)
 		err = journal_wait_on_commit_record(journal, cbh);
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	stats.run.rs_blocks_logged++;
+	if (jbd2_has_feature_async_commit(journal) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
+		blkdev_issue_flush(journal->j_dev);
 	}
 
 	if (err)
 		jbd2_journal_abort(journal, err);
 
+	WARN_ON_ONCE(
+		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
 	/*
 	 * Now disk caches for filesystem device are flushed so we are safe to
 	 * erase checkpointed transactions from the log by updating journal
@@ -919,7 +904,7 @@ start_journal_io:
 	   transaction can be removed from any checkpoint list it was on
 	   before. */
 
-	jbd_debug(3, "JBD2: commit phase 6\n");
+	jbd2_debug(3, "JBD2: commit phase 6\n");
 
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -936,6 +921,7 @@ restart_loop:
 		transaction_t *cp_transaction;
 		struct buffer_head *bh;
 		int try_to_free = 0;
+		bool drop_ref;
 
 		jh = commit_transaction->t_forget;
 		spin_unlock(&journal->j_list_lock);
@@ -945,7 +931,7 @@ restart_loop:
 		 * done with it.
 		 */
 		get_bh(bh);
-		jbd_lock_bh_state(bh);
+		spin_lock(&jh->b_state_lock);
 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
 
 		/*
@@ -990,29 +976,34 @@ restart_loop:
 		 * it. */
 
 		/*
-		 * A buffer which has been freed while still being journaled by
-		 * a previous transaction.
-		 */
-		if (buffer_freed(bh)) {
+		 * A buffer which has been freed while still being journaled
+		 * by a previous transaction, refile the buffer to BJ_Forget of
+		 * the running transaction. If the just committed transaction
+		 * contains "add to orphan" operation, we can completely
+		 * invalidate the buffer now. We are rather through in that
+		 * since the buffer may be still accessible when blocksize <
+		 * pagesize and it is attached to the last partial page.
+		 */
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
+			struct address_space *mapping;
+
+			clear_buffer_freed(bh);
+			clear_buffer_jbddirty(bh);
+
			/*
-			 * If the running transaction is the one containing
-			 * "add to orphan" operation (b_next_transaction !=
-			 * NULL), we have to wait for that transaction to
-			 * commit before we can really get rid of the buffer.
-			 * So just clear b_modified to not confuse transaction
-			 * credit accounting and refile the buffer to
-			 * BJ_Forget of the running transaction. If the just
-			 * committed transaction contains "add to orphan"
-			 * operation, we can completely invalidate the buffer
-			 * now. We are rather through in that since the
-			 * buffer may be still accessible when blocksize <
-			 * pagesize and it is attached to the last partial
-			 * page.
+			 * Block device buffers need to stay mapped all the
+			 * time, so it is enough to clear buffer_jbddirty and
+			 * buffer_freed bits. For the file mapping buffers (i.e.
+			 * journalled data) we need to unmap buffer and clear
+			 * more bits. We also need to be careful about the check
+			 * because the data page mapping can get cleared under
+			 * our hands. Note that if mapping == NULL, we don't
+			 * need to make buffer unmapped because the page is
+			 * already detached from the mapping and buffers cannot
+			 * get reused.
 			 */
-			jh->b_modified = 0;
-			if (!jh->b_next_transaction) {
-				clear_buffer_freed(bh);
-				clear_buffer_jbddirty(bh);
+			mapping = READ_ONCE(bh->b_folio->mapping);
+			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
 				clear_buffer_mapped(bh);
 				clear_buffer_new(bh);
 				clear_buffer_req(bh);
@@ -1040,8 +1031,10 @@ restart_loop:
 				try_to_free = 1;
 		}
 		JBUFFER_TRACE(jh, "refile or unfile buffer");
-		__jbd2_journal_refile_buffer(jh);
-		jbd_unlock_bh_state(bh);
+		drop_ref = __jbd2_journal_refile_buffer(jh);
+		spin_unlock(&jh->b_state_lock);
+		if (drop_ref)
+			jbd2_journal_put_journal_head(jh);
 		if (try_to_free)
 			release_buffer_page(bh);	/* Drops bh reference */
 		else
@@ -1067,9 +1060,28 @@ restart_loop:
 		goto restart_loop;
 	}
 
+	/* Add the transaction to the checkpoint list
+	 * __journal_remove_checkpoint() can not destroy transaction
+	 * under us because it is not marked as T_FINISHED yet */
+	if (journal->j_checkpoint_transactions == NULL) {
+		journal->j_checkpoint_transactions = commit_transaction;
+		commit_transaction->t_cpnext = commit_transaction;
+		commit_transaction->t_cpprev = commit_transaction;
+	} else {
+		commit_transaction->t_cpnext =
+			journal->j_checkpoint_transactions;
+		commit_transaction->t_cpprev =
+			commit_transaction->t_cpnext->t_cpprev;
+		commit_transaction->t_cpnext->t_cpprev =
+			commit_transaction;
+		commit_transaction->t_cpprev->t_cpnext =
+				commit_transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD2: commit phase 7\n");
+	jbd2_debug(3, "JBD2: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
@@ -1085,28 +1097,11 @@ restart_loop:
 		atomic_read(&commit_transaction->t_handle_count);
 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 			     commit_transaction->t_tid, &stats.run);
-
-	/*
-	 * Calculate overall stats
-	 */
-	spin_lock(&journal->j_history_lock);
-	journal->j_stats.ts_tid++;
-	if (commit_transaction->t_requested)
-		journal->j_stats.ts_requested++;
-	journal->j_stats.run.rs_wait += stats.run.rs_wait;
-	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
-	journal->j_stats.run.rs_running += stats.run.rs_running;
-	journal->j_stats.run.rs_locked += stats.run.rs_locked;
-	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
-	journal->j_stats.run.rs_logging += stats.run.rs_logging;
-	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
-	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
-	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
-	spin_unlock(&journal->j_history_lock);
+	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
 
 	commit_transaction->t_state = T_COMMIT_CALLBACK;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
-	journal->j_commit_sequence = commit_transaction->t_tid;
+	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
 	journal->j_committing_transaction = NULL;
 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 
@@ -1122,41 +1117,44 @@ restart_loop:
 
 	write_unlock(&journal->j_state_lock);
 
-	if (journal->j_checkpoint_transactions == NULL) {
-		journal->j_checkpoint_transactions = commit_transaction;
-		commit_transaction->t_cpnext = commit_transaction;
-		commit_transaction->t_cpprev = commit_transaction;
-	} else {
-		commit_transaction->t_cpnext =
-			journal->j_checkpoint_transactions;
-		commit_transaction->t_cpprev =
-			commit_transaction->t_cpnext->t_cpprev;
-		commit_transaction->t_cpnext->t_cpprev =
-			commit_transaction;
-		commit_transaction->t_cpprev->t_cpnext =
-				commit_transaction;
-	}
-	spin_unlock(&journal->j_list_lock);
-	/* Drop all spin_locks because commit_callback may be block.
-	 * __journal_remove_checkpoint() can not destroy transaction
-	 * under us because it is not marked as T_FINISHED yet */
 	if (journal->j_commit_callback)
 		journal->j_commit_callback(journal, commit_transaction);
+	if (journal->j_fc_cleanup_callback)
+		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
 	write_lock(&journal->j_state_lock);
+	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
+	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
 	spin_lock(&journal->j_list_lock);
 	commit_transaction->t_state = T_FINISHED;
-	/* Recheck checkpoint lists after j_list_lock was dropped */
-	if (commit_transaction->t_checkpoint_list == NULL &&
-	    commit_transaction->t_checkpoint_io_list == NULL) {
+	/* Check if the transaction can be dropped now that we are finished */
+	if (commit_transaction->t_checkpoint_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
 		jbd2_journal_free_transaction(commit_transaction);
 	}
 	spin_unlock(&journal->j_list_lock);
 	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_done_commit);
+	wake_up(&journal->j_fc_wait);
+
+	/*
+	 * Calculate overall stats
+	 */
+	spin_lock(&journal->j_history_lock);
+	journal->j_stats.ts_tid++;
+	journal->j_stats.ts_requested += stats.ts_requested;
+	journal->j_stats.run.rs_wait += stats.run.rs_wait;
+	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
+	journal->j_stats.run.rs_running += stats.run.rs_running;
+	journal->j_stats.run.rs_locked += stats.run.rs_locked;
+	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+	journal->j_stats.run.rs_logging += stats.run.rs_logging;
+	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
+	spin_unlock(&journal->j_history_lock);
 }
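
Editor's note: the hunks around journal_submit_data_buffers() above replace the old hard-wired writepage-based data submission with per-journal callbacks (j_submit_inode_data_buffers and j_finish_inode_data_buffers), and export jbd2_journal_finish_inode_data_buffers() as a generic waiter. As a rough illustration only, here is a minimal sketch of how a client filesystem might wire these hooks up; the my_fs_* names are hypothetical and this is not code from the patch (ext4 does something similar when it loads its journal):

/* Hypothetical example -- not part of this patch. */
#include <linux/fs.h>
#include <linux/jbd2.h>

/* Write back only the range jbd2 tracked as dirty for this inode. */
static int my_fs_submit_inode_data(struct jbd2_inode *jinode)
{
	return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
					jinode->i_dirty_start,
					jinode->i_dirty_end);
}

/* Called once after the journal has been initialized. */
static void my_fs_init_journal_callbacks(journal_t *journal)
{
	journal->j_submit_inode_data_buffers = my_fs_submit_inode_data;
	/* Reuse the generic waiter defined in this file. */
	journal->j_finish_inode_data_buffers =
		jbd2_journal_finish_inode_data_buffers;
}

The commit path then invokes these hooks for every jbd2_inode on the committing transaction's t_inode_list, which lets a filesystem restrict writeback to the tracked dirty range instead of the whole mapping.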
