Diffstat (limited to 'fs/xfs/xfs_log.c')
 -rw-r--r--  fs/xfs/xfs_log.c | 952
 1 file changed, 285 insertions, 667 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fc61cc024023..a311385b23d8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -30,10 +31,6 @@ xlog_alloc_log( struct xfs_buftarg *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int -xlog_space_left( - struct xlog *log, - atomic64_t *head); STATIC void xlog_dealloc_log( struct xlog *log); @@ -51,19 +48,12 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp); STATIC void -xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog, struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void -xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, @@ -73,7 +63,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) #endif @@ -120,14 +109,14 @@ xlog_prepare_iovec( vec = &lv->lv_iovecp[0]; } - len = lv->lv_buf_len + sizeof(struct xlog_op_header); + len = lv->lv_buf_used + sizeof(struct xlog_op_header); if (!IS_ALIGNED(len, sizeof(uint64_t))) { - lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + lv->lv_buf_used = round_up(len, sizeof(uint64_t)) - sizeof(struct xlog_op_header); } vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + vec->i_addr = lv->lv_buf + lv->lv_buf_used; oph = vec->i_addr; oph->oh_clientid = XFS_TRANSACTION; @@ -141,70 +130,66 @@ xlog_prepare_iovec( return buf; } -static void +static inline void xlog_grant_sub_space( - struct xlog *log, - atomic64_t *head, - int bytes) + struct xlog_grant_head *head, + int64_t bytes) { - int64_t head_val = atomic64_read(head); - int64_t new, old; - - do { - int cycle, space; - - xlog_crack_grant_head_val(head_val, &cycle, &space); - - space -= bytes; - if (space < 0) { - space += log->l_logsize; - cycle--; - } - - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); - } while (head_val != old); + atomic64_sub(bytes, &head->grant); } -static void +static inline void xlog_grant_add_space( - struct xlog *log, - atomic64_t *head, - int bytes) + struct xlog_grant_head *head, + int64_t bytes) { - int64_t head_val = atomic64_read(head); - int64_t new, old; - - do { - int tmp; - int cycle, space; - - xlog_crack_grant_head_val(head_val, &cycle, &space); - - tmp = log->l_logsize - space; - if (tmp > bytes) - space += bytes; - else { - space = bytes - tmp; - cycle++; - } - - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); - } while (head_val != old); + atomic64_add(bytes, &head->grant); } -STATIC void +static void xlog_grant_head_init( struct xlog_grant_head *head) { - xlog_assign_grant_head(&head->grant, 1, 0); + atomic64_set(&head->grant, 0); INIT_LIST_HEAD(&head->waiters); spin_lock_init(&head->lock); } +void +xlog_grant_return_space( + struct xlog *log, + xfs_lsn_t old_head, + xfs_lsn_t new_head) +{ + int64_t diff = xlog_lsn_sub(log, new_head, old_head); + + xlog_grant_sub_space(&log->l_reserve_head, diff); + xlog_grant_sub_space(&log->l_write_head, diff); +} + +/* + * Return the space in the log between the tail and the head. In the case where + * we have overrun available reservation space, return 0. 
The memory barrier + * pairs with the smp_wmb() in xlog_cil_ail_insert() to ensure that grant head + * vs tail space updates are seen in the correct order and hence avoid + * transients as space is transferred from the grant heads to the AIL on commit + * completion. + */ +static uint64_t +xlog_grant_space_left( + struct xlog *log, + struct xlog_grant_head *head) +{ + int64_t free_bytes; + + smp_rmb(); /* paired with smp_wmb in xlog_cil_ail_insert() */ + free_bytes = log->l_logsize - READ_ONCE(log->l_tail_space) - + atomic64_read(&head->grant); + if (free_bytes > 0) + return free_bytes; + return 0; +} + STATIC void xlog_grant_head_wake_all( struct xlog_grant_head *head) @@ -242,42 +227,15 @@ xlog_grant_head_wake( { struct xlog_ticket *tic; int need_bytes; - bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { - - /* - * There is a chance that the size of the CIL checkpoints in - * progress at the last AIL push target calculation resulted in - * limiting the target to the log head (l_last_sync_lsn) at the - * time. This may not reflect where the log head is now as the - * CIL checkpoints may have completed. - * - * Hence when we are woken here, it may be that the head of the - * log that has moved rather than the tail. As the tail didn't - * move, there still won't be space available for the - * reservation we require. However, if the AIL has already - * pushed to the target defined by the old log head location, we - * will hang here waiting for something else to update the AIL - * push target. - * - * Therefore, if there isn't space to wake the first waiter on - * the grant head, we need to push the AIL again to ensure the - * target reflects both the current log tail and log head - * position before we wait for the tail to move again. - */ - need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) { - if (!woken_task) - xlog_grant_push_ail(log, need_bytes); + if (*free_bytes < need_bytes) return false; - } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); - woken_task = true; } return true; @@ -296,13 +254,15 @@ xlog_grant_head_wait( do { if (xlog_is_shutdown(log)) goto shutdown; - xlog_grant_push_ail(log, need_bytes); __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&head->lock); XFS_STATS_INC(log->l_mp, xs_sleep_logspace); + /* Push on the AIL to free up all the log space. */ + xfs_ail_push_all(log->l_ailp); + trace_xfs_log_grant_sleep(log, tic); schedule(); trace_xfs_log_grant_wake(log, tic); @@ -310,7 +270,7 @@ xlog_grant_head_wait( spin_lock(&head->lock); if (xlog_is_shutdown(log)) goto shutdown; - } while (xlog_space_left(log, &head->grant) < need_bytes); + } while (xlog_grant_space_left(log, head) < need_bytes); list_del_init(&tic->t_queue); return 0; @@ -355,7 +315,7 @@ xlog_grant_head_check( * otherwise try to get some space for this transaction. */ *need_bytes = xlog_ticket_reservation(log, head, tic); - free_bytes = xlog_space_left(log, &head->grant); + free_bytes = xlog_grant_space_left(log, head); if (!list_empty_careful(&head->waiters)) { spin_lock(&head->lock); if (!xlog_grant_head_wake(log, head, &free_bytes) || @@ -418,9 +378,6 @@ xfs_log_regrant( * of rolling transactions in the log easily. 
*/ tic->t_tid++; - - xlog_grant_push_ail(log, tic->t_unit_res); - tic->t_curr_res = tic->t_unit_res; if (tic->t_cnt > 0) return 0; @@ -432,9 +389,8 @@ xfs_log_regrant( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + xlog_grant_add_space(&log->l_write_head, need_bytes); trace_xfs_log_regrant_exit(log, tic); - xlog_verify_grant_tail(log); return 0; out_error: @@ -477,21 +433,15 @@ xfs_log_reserve( ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; - - xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt - : tic->t_unit_res); - trace_xfs_log_reserve(log, tic); - error = xlog_grant_head_check(log, &log->l_reserve_head, tic, &need_bytes); if (error) goto out_error; - xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); - xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + xlog_grant_add_space(&log->l_reserve_head, need_bytes); + xlog_grant_add_space(&log->l_write_head, need_bytes); trace_xfs_log_reserve_exit(log, tic); - xlog_verify_grant_tail(log); return 0; out_error: @@ -571,7 +521,6 @@ xlog_state_release_iclog( struct xlog_in_core *iclog, struct xlog_ticket *ticket) { - xfs_lsn_t tail_lsn; bool last_ref; lockdep_assert_held(&log->l_icloglock); @@ -585,9 +534,9 @@ xlog_state_release_iclog( */ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && - !iclog->ic_header.h_tail_lsn) { - tail_lsn = xlog_assign_tail_lsn(log->l_mp); - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + !iclog->ic_header->h_tail_lsn) { + iclog->ic_header->h_tail_lsn = + cpu_to_be64(atomic64_read(&log->l_tail_lsn)); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -633,15 +582,14 @@ xlog_state_release_iclog( */ int xfs_log_mount( - xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) + xfs_mount_t *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - struct xlog *log; - bool fatal = xfs_has_crc(mp); - int error = 0; - int min_logfsbs; + struct xlog *log; + int error = 0; + int min_logfsbs; if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem %pU", @@ -663,53 +611,37 @@ xfs_log_mount( mp->m_log = log; /* - * Validate the given log space and drop a critical message via syslog - * if the log size is too small that would lead to some unexpected - * situations in transaction log space reservation stage. + * Now that we have set up the log and it's internal geometry + * parameters, we can validate the given log space and drop a critical + * message via syslog if the log size is too small. A log that is too + * small can lead to unexpected situations in transaction log space + * reservation stage. The superblock verifier has already validated all + * the other log geometry constraints, so we don't have to check those + * here. * - * Note: we can't just reject the mount if the validation fails. This - * would mean that people would have to downgrade their kernel just to - * remedy the situation as there is no way to grow the log (short of - * black magic surgery with xfs_db). + * Note: For v4 filesystems, we can't just reject the mount if the + * validation fails. This would mean that people would have to + * downgrade their kernel just to remedy the situation as there is no + * way to grow the log (short of black magic surgery with xfs_db). 
* - * We can, however, reject mounts for CRC format filesystems, as the + * We can, however, reject mounts for V5 format filesystems, as the * mkfs binary being used to make the filesystem should never create a * filesystem with a log that is too small. */ min_logfsbs = xfs_log_calc_minimum_size(mp); - if (mp->m_sb.sb_logblocks < min_logfsbs) { xfs_warn(mp, "Log size %d blocks too small, minimum size is %d blocks", mp->m_sb.sb_logblocks, min_logfsbs); - error = -EINVAL; - } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { - xfs_warn(mp, - "Log size %d blocks too large, maximum size is %lld blocks", - mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); - error = -EINVAL; - } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { - xfs_warn(mp, - "log size %lld bytes too large, maximum size is %lld bytes", - XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), - XFS_MAX_LOG_BYTES); - error = -EINVAL; - } else if (mp->m_sb.sb_logsunit > 1 && - mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { - xfs_warn(mp, - "log stripe unit %u bytes must be a multiple of block size", - mp->m_sb.sb_logsunit); - error = -EINVAL; - fatal = true; - } - if (error) { + /* * Log check errors are always fatal on v5; or whenever bad * metadata leads to a crash. */ - if (fatal) { + if (xfs_has_crc(mp)) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); + error = -EINVAL; goto out_free_log; } xfs_crit(mp, "Log size out of supported range."); @@ -732,15 +664,7 @@ xfs_log_mount( * just worked. */ if (!xfs_has_norecovery(mp)) { - /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, - &mp->m_opstate); error = xlog_recover(log); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); @@ -789,7 +713,6 @@ xfs_log_mount_finish( struct xfs_mount *mp) { struct xlog *log = mp->m_log; - bool readonly; int error = 0; if (xfs_has_norecovery(mp)) { @@ -798,12 +721,6 @@ xfs_log_mount_finish( } /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - - /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -852,8 +769,6 @@ xfs_log_mount_finish( xfs_buftarg_drain(mp->m_ddev_targp); clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ ASSERT(!error || xlog_is_shutdown(log)); @@ -1054,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. 
*/ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1183,7 +1098,7 @@ xfs_log_space_wake( ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); - free_bytes = xlog_space_left(log, &log->l_write_head.grant); + free_bytes = xlog_grant_space_left(log, &log->l_write_head); xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); spin_unlock(&log->l_write_head.lock); } @@ -1192,7 +1107,7 @@ xfs_log_space_wake( ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); - free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_bytes = xlog_grant_space_left(log, &log->l_reserve_head); xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); spin_unlock(&log->l_reserve_head.lock); } @@ -1306,105 +1221,6 @@ xfs_log_cover( return error; } -/* - * We may be holding the log iclog lock upon entering this routine. - */ -xfs_lsn_t -xlog_assign_tail_lsn_locked( - struct xfs_mount *mp) -{ - struct xlog *log = mp->m_log; - struct xfs_log_item *lip; - xfs_lsn_t tail_lsn; - - assert_spin_locked(&mp->m_ail->ail_lock); - - /* - * To make sure we always have a valid LSN for the log tail we keep - * track of the last LSN which was committed in log->l_last_sync_lsn, - * and use that when the AIL was empty. - */ - lip = xfs_ail_min(mp->m_ail); - if (lip) - tail_lsn = lip->li_lsn; - else - tail_lsn = atomic64_read(&log->l_last_sync_lsn); - trace_xfs_log_assign_tail_lsn(log, tail_lsn); - atomic64_set(&log->l_tail_lsn, tail_lsn); - return tail_lsn; -} - -xfs_lsn_t -xlog_assign_tail_lsn( - struct xfs_mount *mp) -{ - xfs_lsn_t tail_lsn; - - spin_lock(&mp->m_ail->ail_lock); - tail_lsn = xlog_assign_tail_lsn_locked(mp); - spin_unlock(&mp->m_ail->ail_lock); - - return tail_lsn; -} - -/* - * Return the space in the log between the tail and the head. The head - * is passed in the cycle/bytes formal parms. In the special case where - * the reserve head has wrapped passed the tail, this calculation is no - * longer valid. In this case, just return 0 which means there is no space - * in the log. This works for all places where this function is called - * with the reserve head. Of course, if the write head were to ever - * wrap the tail, we should blow up. Rather than catch this case here, - * we depend on other ASSERTions in other parts of the code. XXXmiken - * - * If reservation head is behind the tail, we have a problem. Warn about it, - * but then treat it as if the log is empty. - * - * If the log is shut down, the head and tail may be invalid or out of whack, so - * shortcut invalidity asserts in this case so that we don't trigger them - * falsely. - */ -STATIC int -xlog_space_left( - struct xlog *log, - atomic64_t *head) -{ - int tail_bytes; - int tail_cycle; - int head_cycle; - int head_bytes; - - xlog_crack_grant_head(head, &head_cycle, &head_bytes); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); - tail_bytes = BBTOB(tail_bytes); - if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - return log->l_logsize - (head_bytes - tail_bytes); - if (tail_cycle + 1 < head_cycle) - return 0; - - /* Ignore potential inconsistency when shutdown. 
*/ - if (xlog_is_shutdown(log)) - return log->l_logsize; - - if (tail_cycle < head_cycle) { - ASSERT(tail_cycle == (head_cycle - 1)); - return tail_bytes - head_bytes; - } - - /* - * The reservation head is behind the tail. In this case we just want to - * return the size of the log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - return log->l_logsize; -} - - static void xlog_ioend_work( struct work_struct *work) @@ -1424,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1463,11 +1279,12 @@ xlog_get_iclog_buffer_size( log->l_iclog_size = mp->m_logbsize; /* - * # headers = size / 32k - one header holds cycles from 32k of data. + * Combined size of the log record headers. The first 32k cycles + * are stored directly in the xlog_rec_header, the rest in the + * variable number of xlog_rec_ext_headers at its end. */ - log->l_iclog_heads = - DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); - log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; + log->l_iclog_hsize = struct_size(log->l_iclog->ic_header, h_ext, + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE) - 1); } void @@ -1482,7 +1299,7 @@ xfs_log_work_queue( * Clear the log incompat flags if we have the opportunity. * * This only happens if we're about to log the second dummy transaction as part - * of covering the log and we can get the log incompat feature usage lock. + * of covering the log. 
*/ static inline void xlog_clear_incompat( @@ -1497,11 +1314,7 @@ xlog_clear_incompat( if (log->l_covered_state != XLOG_STATE_COVER_DONE2) return; - if (!down_write_trylock(&log->l_incompat_users)) - return; - xfs_clear_incompat_log_features(mp); - up_write(&log->l_incompat_users); } /* @@ -1555,14 +1368,13 @@ xlog_alloc_log( int num_bblks) { struct xlog *log; - xlog_rec_header_t *head; - xlog_in_core_t **iclogp; - xlog_in_core_t *iclog, *prev_iclog=NULL; + struct xlog_in_core **iclogp; + struct xlog_in_core *iclog, *prev_iclog = NULL; int i; int error = -ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1576,11 +1388,11 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) @@ -1618,27 +1430,19 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; - init_rwsem(&log->l_incompat_users); - xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); iclogp = &log->l_iclog; - /* - * The amount of memory to allocate for the iclog structure is - * rather funky due to the way the structure is defined. It is - * done this way so that we can use different sizes for machines - * with different amounts of memory. See the definition of - * xlog_in_core_t in xfs_log_priv.h for details. - */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); - iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); + iclog = kzalloc(sizeof(*iclog) + bvec_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog) goto out_free_iclog; @@ -1646,26 +1450,25 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kvzalloc(log->l_iclog_size, + iclog->ic_header = kvzalloc(log->l_iclog_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!iclog->ic_data) + if (!iclog->ic_header) goto out_free_iclog; - head = &iclog->ic_header; - memset(head, 0, sizeof(xlog_rec_header_t)); - head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); - head->h_version = cpu_to_be32( + iclog->ic_header->h_magicno = + cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + iclog->ic_header->h_version = cpu_to_be32( xfs_has_logv2(log->l_mp) ? 
2 : 1); - head->h_size = cpu_to_be32(log->l_iclog_size); - /* new fields */ - head->h_fmt = cpu_to_be32(XLOG_FMT); - memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + iclog->ic_header->h_size = cpu_to_be32(log->l_iclog_size); + iclog->ic_header->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&iclog->ic_header->h_fs_uuid, &mp->m_sb.sb_uuid, + sizeof(iclog->ic_header->h_fs_uuid)); + iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1678,8 +1481,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1694,101 +1496,18 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); - kmem_free(iclog); + kvfree(iclog->ic_header); + kfree(iclog); if (prev_iclog == log->l_iclog) break; } out_free_log: - kmem_free(log); + kfree(log); out: return ERR_PTR(error); } /* xlog_alloc_log */ /* - * Compute the LSN that we'd need to push the log tail towards in order to have - * (a) enough on-disk log space to log the number of bytes specified, (b) at - * least 25% of the log space free, and (c) at least 256 blocks free. If the - * log free space already meets all three thresholds, this function returns - * NULLCOMMITLSN. - */ -xfs_lsn_t -xlog_grant_push_threshold( - struct xlog *log, - int need_bytes) -{ - xfs_lsn_t threshold_lsn = 0; - xfs_lsn_t last_sync_lsn; - int free_blocks; - int free_bytes; - int threshold_block; - int threshold_cycle; - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. - */ - free_threshold = BTOBB(need_bytes); - free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = max(free_threshold, 256); - if (free_blocks >= free_threshold) - return NULLCOMMITLSN; - - xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, - &threshold_block); - threshold_block += free_threshold; - if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; - } - threshold_lsn = xlog_assign_lsn(threshold_cycle, - threshold_block); - /* - * Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. Use a snapshot of the last sync lsn - * so that it doesn't change between the compare and the set. - */ - last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); - if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) - threshold_lsn = last_sync_lsn; - - return threshold_lsn; -} - -/* - * Push the tail of the log if we need to do so to maintain the free log space - * thresholds set out by xlog_grant_push_threshold. 
We may need to adopt a - * policy which pushes on an lsn which is further along in the log once we - * reach the high water mark. In this manner, we would be creating a low water - * mark. - */ -STATIC void -xlog_grant_push_ail( - struct xlog *log, - int need_bytes) -{ - xfs_lsn_t threshold_lsn; - - threshold_lsn = xlog_grant_push_threshold(log, need_bytes); - if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) - return; - - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - xfs_ail_push(log->l_ailp, threshold_lsn); -} - -/* * Stamp cycle number in every block */ STATIC void @@ -1797,36 +1516,19 @@ xlog_pack_data( struct xlog_in_core *iclog, int roundoff) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - char *dp; - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + struct xlog_rec_header *rhead = iclog->ic_header; + __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn); + char *dp = iclog->ic_datap; + int i; - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size); i++) { - if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) - break; - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + for (i = 0; i < BTOBB(iclog->ic_offset + roundoff); i++) { + *xlog_cycle_data(rhead, i) = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - for (i = 1; i < log->l_iclog_heads; i++) - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + for (i = 0; i < (log->l_iclog_hsize >> BBSHIFT) - 1; i++) + rhead->h_ext[i].xh_cycle = cycle_lsn; } /* @@ -1840,27 +1542,22 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ if (xfs_has_logv2(log->l_mp)) { - union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; - int i; - int xheads; + int xheads, i; - xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); - - for (i = 1; i < xheads; i++) { - crc = crc32c(crc, &xhdr[i].hic_xheader, - sizeof(struct xlog_rec_ext_header)); - } + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE) - 1; + for (i = 0; i < xheads; i++) + crc = crc32c(crc, &rhead->h_ext[i], XLOG_REC_EXT_SIZE); } /* ... and finally for the payload */ @@ -1879,27 +1576,6 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static int -xlog_map_iclog_data( - struct bio *bio, - void *data, - size_t count) -{ - do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - size_t len = min_t(size_t, count, PAGE_SIZE - off); - - if (bio_add_page(bio, page, len, off) != len) - return -EIO; - - data += len; - count -= len; - } while (count); - - return 0; -} - STATIC void xlog_write_iclog( struct xlog *log, @@ -1927,9 +1603,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. 
*/ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1959,22 +1633,20 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; + if (is_vmalloc_addr(iclog->ic_header)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_header, count)) + goto shutdown; + } else { + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_header, count); } - if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, count); /* * If this log buffer would straddle the end of the log we will have @@ -1993,6 +1665,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* @@ -2085,8 +1763,8 @@ xlog_sync( if (ticket) { ticket->t_curr_res -= roundoff; } else { - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + xlog_grant_add_space(&log->l_reserve_head, roundoff); + xlog_grant_add_space(&log->l_write_head, roundoff); } /* put cycle number in every block */ @@ -2096,20 +1774,20 @@ xlog_sync( size = iclog->ic_offset; if (xfs_has_logv2(log->l_mp)) size += roundoff; - iclog->ic_header.h_len = cpu_to_be32(size); + iclog->ic_header->h_len = cpu_to_be32(size); XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); - bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header->h_lsn)); /* Do we need to split this write into 2 parts? */ if (bno + BTOBB(count) > log->l_logBBsize) - xlog_split_iclog(log, &iclog->ic_header, bno, count); + xlog_split_iclog(log, iclog->ic_header, bno, count); /* calculcate the checksum */ - iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_header->h_crc = xlog_cksum(log, iclog->ic_header, + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -2118,12 +1796,12 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { - iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + iclog->ic_header->h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. 
Shutdown imminent.", - be64_to_cpu(iclog->ic_header.h_lsn)); + be64_to_cpu(iclog->ic_header->h_lsn)); } #endif xlog_verify_iclog(log, iclog, count); @@ -2135,10 +1813,10 @@ xlog_sync( */ STATIC void xlog_dealloc_log( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog, *next_iclog; - int i; + struct xlog_in_core *iclog, *next_iclog; + int i; /* * Destroy the CIL after waiting for iclog IO completion because an @@ -2150,14 +1828,14 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); - kmem_free(iclog); + kvfree(iclog->ic_header); + kfree(iclog); iclog = next_iclog; } log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); - kmem_free(log); + kfree(log); } /* @@ -2172,7 +1850,7 @@ xlog_state_finish_copy( { lockdep_assert_held(&log->l_icloglock); - be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + be32_add_cpu(&iclog->ic_header->h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; } @@ -2222,9 +1900,9 @@ xlog_print_trans( if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); - xfs_warn(mp, " size = %d", lv->lv_size); + xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size); xfs_warn(mp, " bytes = %d", lv->lv_bytes); - xfs_warn(mp, " buf len = %d", lv->lv_buf_len); + xfs_warn(mp, " buf used= %d", lv->lv_buf_used); /* dump each iovec for the log item */ vec = lv->lv_iovecp; @@ -2595,7 +2273,7 @@ xlog_state_activate_iclog( * We don't need to cover the dummy. */ if (*iclogs_changed == 0 && - iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + iclog->ic_header->h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { *iclogs_changed = 1; } else { /* @@ -2607,11 +2285,11 @@ xlog_state_activate_iclog( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - iclog->ic_header.h_tail_lsn = 0; + iclog->ic_header->h_num_logops = 0; + memset(iclog->ic_header->h_cycle_data, 0, + sizeof(iclog->ic_header->h_cycle_data)); + iclog->ic_header->h_lsn = 0; + iclog->ic_header->h_tail_lsn = 0; } /* @@ -2703,7 +2381,7 @@ xlog_get_lowest_lsn( iclog->ic_state == XLOG_STATE_DIRTY) continue; - lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lsn = be64_to_cpu(iclog->ic_header->h_lsn); if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; } while ((iclog = iclog->ic_next) != log->l_iclog); @@ -2712,47 +2390,6 @@ xlog_get_lowest_lsn( } /* - * Completion of a iclog IO does not imply that a transaction has completed, as - * transactions can be large enough to span many iclogs. We cannot change the - * tail of the log half way through a transaction as this may be the only - * transaction in the log and moving the tail to point to the middle of it - * will prevent recovery from finding the start of the transaction. Hence we - * should only update the last_sync_lsn if this iclog contains transaction - * completion callbacks on it. - * - * We have to do this before we drop the icloglock to ensure we are the only one - * that can update it. - * - * If we are moving the last_sync_lsn forwards, we also need to ensure we kick - * the reservation grant head pushing. This is due to the fact that the push - * target is bound by the current last_sync_lsn value. 
Hence if we have a large - * amount of log space bound up in this committing transaction then the - * last_sync_lsn value may be the limiting factor preventing tail pushing from - * freeing space in the log. Hence once we've updated the last_sync_lsn we - * should push the AIL to ensure the push target (and hence the grant head) is - * no longer bound by the old log head location and can move forwards and make - * progress again. - */ -static void -xlog_state_set_callback( - struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t header_lsn) -{ - trace_xlog_iclog_callback(iclog, _RET_IP_); - iclog->ic_state = XLOG_STATE_CALLBACK; - - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - header_lsn) <= 0); - - if (list_empty_careful(&iclog->ic_callbacks)) - return; - - atomic64_set(&log->l_last_sync_lsn, header_lsn); - xlog_grant_push_ail(log, 0); -} - -/* * Return true if we need to stop processing, false to continue to the next * iclog. The caller will need to run callbacks if the iclog is returned in the * XLOG_STATE_CALLBACK state. @@ -2779,11 +2416,21 @@ xlog_state_iodone_process_iclog( * If this is not the lowest lsn iclog, then we will leave it * for another completion to process. */ - header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + header_lsn = be64_to_cpu(iclog->ic_header->h_lsn); lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) return false; - xlog_state_set_callback(log, iclog, header_lsn); + /* + * If there are no callbacks on this iclog, we can mark it clean + * immediately and return. Otherwise we need to run the + * callbacks. + */ + if (list_empty(&iclog->ic_callbacks)) { + xlog_state_clean_iclog(log, iclog); + return false; + } + trace_xlog_iclog_callback(iclog, _RET_IP_); + iclog->ic_state = XLOG_STATE_CALLBACK; return false; default: /* @@ -2932,9 +2579,9 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp) { - int log_offset; - xlog_rec_header_t *head; - xlog_in_core_t *iclog; + int log_offset; + struct xlog_rec_header *head; + struct xlog_in_core *iclog; restart: spin_lock(&log->l_icloglock); @@ -2952,7 +2599,7 @@ restart: goto restart; } - head = &iclog->ic_header; + head = iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; @@ -2978,10 +2625,11 @@ restart: * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. 
*/ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3037,21 +2685,16 @@ xfs_log_ticket_regrant( if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_reserve_head.grant, - ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_write_head.grant, - ticket->t_curr_res); + xlog_grant_sub_space(&log->l_reserve_head, ticket->t_curr_res); + xlog_grant_sub_space(&log->l_write_head, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ if (!ticket->t_cnt) { - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); + xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res); trace_xfs_log_ticket_regrant_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; } xfs_log_ticket_put(ticket); @@ -3095,8 +2738,8 @@ xfs_log_ticket_ungrant( bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); - xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); + xlog_grant_sub_space(&log->l_reserve_head, bytes); + xlog_grant_sub_space(&log->l_write_head, bytes); trace_xfs_log_ticket_ungrant_exit(log, ticket); @@ -3121,7 +2764,7 @@ xlog_state_switch_iclogs( if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + iclog->ic_header->h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -3165,7 +2808,7 @@ xlog_force_and_check_iclog( struct xlog_in_core *iclog, bool *completed) { - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); int error; *completed = false; @@ -3177,7 +2820,7 @@ xlog_force_and_check_iclog( * If the iclog has already been completed and reused the header LSN * will have been rewritten by completion */ - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + if (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) *completed = true; return 0; } @@ -3195,7 +2838,7 @@ xlog_force_and_check_iclog( * * 1. the current iclog is active and has no data; the previous iclog * is in the active or dirty state. - * 2. the current iclog is drity, and the previous iclog is in the + * 2. the current iclog is dirty, and the previous iclog is in the * active or dirty state. 
* * We may sleep if: @@ -3310,7 +2953,7 @@ xlog_force_lsn( goto out_error; iclog = log->l_iclog; - while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + while (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) @@ -3419,16 +3062,16 @@ xfs_log_force_seq( */ void xfs_log_ticket_put( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) kmem_cache_free(xfs_log_ticket_cache, ticket); } -xlog_ticket_t * +struct xlog_ticket * xfs_log_ticket_get( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); atomic_inc(&ticket->t_ref); @@ -3480,11 +3123,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xlog_op_header); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3507,12 +3150,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3549,7 +3192,8 @@ xlog_ticket_alloc( struct xlog_ticket *tic; int unit_res; - tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); + tic = kmem_cache_zalloc(xfs_log_ticket_cache, + GFP_KERNEL | __GFP_NOFAIL); unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); @@ -3568,65 +3212,63 @@ xlog_ticket_alloc( } #if defined(DEBUG) -/* - * Check to make sure the grant write head didn't just over lap the tail. If - * the cycles are the same, we can't be overlapping. Otherwise, make sure that - * the cycles differ by exactly one and check the byte count. - * - * This check is run unlocked, so can give false positives. Rather than assert - * on failures, use a warn-once flag and a panic tag to allow the admin to - * determine if they want to panic the machine when such an error occurs. For - * debug kernels this will have the same effect as using an assert but, unlinke - * an assert, it can be turned off at runtime. 
- */ -STATIC void -xlog_verify_grant_tail( - struct xlog *log) -{ - int tail_cycle, tail_blocks; - int cycle, space; - - xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); - if (tail_cycle != cycle) { - if (cycle - 1 != tail_cycle && - !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { - xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, - "%s: cycle - 1 != tail_cycle", __func__); - } - - if (space > BBTOB(tail_blocks) && - !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { - xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, - "%s: space > BBTOB(tail_blocks)", __func__); - } - } -} - -/* check if it will fit */ +static void +xlog_verify_dump_tail( + struct xlog *log, + struct xlog_in_core *iclog) +{ + xfs_alert(log->l_mp, +"ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x", + iclog ? be64_to_cpu(iclog->ic_header->h_tail_lsn) : -1, + atomic64_read(&log->l_tail_lsn), + log->l_ailp->ail_head_lsn, + log->l_curr_cycle, log->l_curr_block, + log->l_prev_cycle, log->l_prev_block); + xfs_alert(log->l_mp, +"write grant 0x%llx, reserve grant 0x%llx, tail_space 0x%llx, size 0x%x, iclog flags 0x%x", + atomic64_read(&log->l_write_head.grant), + atomic64_read(&log->l_reserve_head.grant), + log->l_tail_space, log->l_logsize, + iclog ? iclog->ic_flags : -1); +} + +/* Check if the new iclog will fit in the log. */ STATIC void xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog) { - xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header->h_tail_lsn); int blocks; - if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { - blocks = - log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); - if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) - xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); - } else { - ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = log->l_logBBsize - + (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset) + + BTOBB(log->l_iclog_hsize)) { + xfs_emerg(log->l_mp, + "%s: ran out of log space", __func__); + xlog_verify_dump_tail(log, iclog); + } + return; + } - if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + if (CYCLE_LSN(tail_lsn) + 1 != log->l_prev_cycle) { + xfs_emerg(log->l_mp, "%s: head has wrapped tail.", __func__); + xlog_verify_dump_tail(log, iclog); + return; + } + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) { xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + xlog_verify_dump_tail(log, iclog); + return; + } blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; - if (blocks < BTOBB(iclog->ic_offset) + 1) - xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); - } + if (blocks < BTOBB(iclog->ic_offset) + 1) { + xfs_emerg(log->l_mp, "%s: ran out of iclog space", __func__); + xlog_verify_dump_tail(log, iclog); + } } /* @@ -3650,13 +3292,12 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; - xlog_in_core_t *icptr; - xlog_in_core_2_t *xhdr; - void *base_ptr, *ptr, *p; + struct xlog_rec_header *rhead = iclog->ic_header; + struct xlog_in_core *icptr; + void *base_ptr, *ptr; ptrdiff_t field_offset; uint8_t clientid; - int len, i, j, k, op_len; + int len, i, op_len; int idx; /* check validity of iclog pointers */ @@ -3670,11 +3311,10 @@ xlog_verify_iclog( spin_unlock(&log->l_icloglock); /* check log magic numbers */ - if 
(iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + if (rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - base_ptr = ptr = &iclog->ic_header; - p = &iclog->ic_header; + base_ptr = ptr = rhead; for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", @@ -3682,29 +3322,19 @@ xlog_verify_iclog( } /* check fields */ - len = be32_to_cpu(iclog->ic_header.h_num_logops); + len = be32_to_cpu(rhead->h_num_logops); base_ptr = ptr = iclog->ic_datap; - ophead = ptr; - xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = ptr; + struct xlog_op_header *ophead = ptr; + void *p = &ophead->oh_clientid; /* clientid is only 1 byte */ - p = &ophead->oh_clientid; field_offset = p - base_ptr; if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - clientid = xlog_get_client_id( - xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - clientid = xlog_get_client_id( - iclog->ic_header.h_cycle_data[idx]); - } + clientid = xlog_get_client_id(*xlog_cycle_data(rhead, idx)); } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, @@ -3720,15 +3350,9 @@ xlog_verify_iclog( op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); - } + op_len = be32_to_cpu(*xlog_cycle_data(rhead, idx)); } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif @@ -3763,6 +3387,16 @@ xlog_force_shutdown( return false; /* + * Ensure that there is only ever one log shutdown being processed. + * If we allow the log force below on a second pass after shutting + * down the log, we risk deadlocking the CIL push as it may require + * locks on objects the current shutdown context holds (e.g. taking + * buffer locks to abort buffers on last unpin of buf log items). + */ + if (test_and_set_bit(XLOG_SHUTDOWN_STARTED, &log->l_opstate)) + return false; + + /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs @@ -3794,6 +3428,7 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); + ASSERT(0); return false; } spin_unlock(&log->l_icloglock); @@ -3802,7 +3437,7 @@ xlog_force_shutdown( * If this log shutdown also sets the mount shutdown state, issue a * shutdown warning message. 
*/ - if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + if (!xfs_set_shutdown(log->l_mp)) { xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, "Filesystem has been shut down due to log error (0x%x).", shutdown_flags); @@ -3838,24 +3473,27 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } STATIC int xlog_iclogs_empty( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog; + struct xlog_in_core *iclog = log->l_iclog; - iclog = log->l_iclog; do { /* endianness does not matter here, zero is zero in * any language. */ - if (iclog->ic_header.h_num_logops) + if (iclog->ic_header->h_num_logops) return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); + return 1; } @@ -3901,23 +3539,3 @@ xfs_log_check_lsn( return valid; } - -/* - * Notify the log that we're about to start using a feature that is protected - * by a log incompat feature flag. This will prevent log covering from - * clearing those flags. - */ -void -xlog_use_incompat_feat( - struct xlog *log) -{ - down_read(&log->l_incompat_users); -} - -/* Notify the log that we've finished using log incompat features. */ -void -xlog_drop_incompat_feat( - struct xlog *log) -{ - up_read(&log->l_incompat_users); -} |
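Note: the core of this patch replaces the cycle/byte-packed grant heads (and the xlog_space_left()/xlog_grant_push_ail() machinery built around them) with plain byte counters measured against a tail-space value that is updated at checkpoint completion. The standalone sketch below models only that arithmetic: the function names mirror the diff, but the simplified types, the absence of locking and waiter handling, and the main() scenario are assumptions made for illustration, not the in-kernel implementation.

/*
 * Standalone model of the byte-counter grant accounting introduced by this
 * patch.  Plain C11 atomics stand in for the kernel's atomic64_t, and the
 * waiters list, locking and AIL are omitted, so this only demonstrates the
 * arithmetic behind xlog_grant_space_left().
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct grant_head {
	atomic_llong grant;		/* bytes currently reserved */
};

struct model_log {
	long long logsize;		/* usable log size in bytes */
	atomic_llong tail_space;	/* bytes pinned between tail and head */
	struct grant_head reserve_head;
	struct grant_head write_head;
};

/* Reservation and release are now simple byte additions and subtractions. */
static void grant_add_space(struct grant_head *head, long long bytes)
{
	atomic_fetch_add(&head->grant, bytes);
}

static void grant_sub_space(struct grant_head *head, long long bytes)
{
	atomic_fetch_sub(&head->grant, bytes);
}

/*
 * Free space is whatever remains after subtracting both the space pinned by
 * items waiting for writeback (tail_space) and the outstanding reservations
 * (grant).  A transient overrun reports zero instead of a negative value,
 * matching xlog_grant_space_left() in the diff.
 */
static long long grant_space_left(struct model_log *log, struct grant_head *head)
{
	long long free_bytes = log->logsize - atomic_load(&log->tail_space) -
			       atomic_load(&head->grant);

	return free_bytes > 0 ? free_bytes : 0;
}

int main(void)
{
	struct model_log log = { .logsize = 64LL << 20 };

	/* xfs_log_reserve(): charge the reservation to both grant heads. */
	grant_add_space(&log.reserve_head, 1 << 20);
	grant_add_space(&log.write_head, 1 << 20);

	/*
	 * Checkpoint completion: the space moves from the grant heads to
	 * tail_space, where the AIL will release it as items are written back.
	 */
	grant_sub_space(&log.reserve_head, 1 << 20);
	grant_sub_space(&log.write_head, 1 << 20);
	atomic_store(&log.tail_space, 1 << 20);

	printf("reserve head space left: %lld bytes\n",
	       grant_space_left(&log, &log.reserve_head));
	return 0;
}

In the kernel the same transfer is split between xlog_cil_ail_insert(), which grows l_tail_space under an smp_wmb(), and xlog_grant_return_space(), which shrinks both grant heads; that pairing is why xlog_grant_space_left() only needs the smp_rmb() noted in its comment rather than a lock.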
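A second structural change here folds the old xlog_in_core_2_t header array into a single record header allocation: iclog->ic_data becomes iclog->ic_header, and the cycle data for blocks beyond the first 32k of the iclog is reached through the extended headers at h_ext[] via xlog_cycle_data(). The model below shows only that indexing; the pared-down structures and the cycle_data() helper are guesses at the shape of the real definitions in xfs_log_format.h/xfs_log_priv.h, so read it as an illustration of the layout rather than the on-disk format.

/*
 * Illustrative model of the record header layout after this change: one base
 * header covering the first 32k of iclog data, followed by a flexible array
 * of extended headers for the rest.  Field names echo the diff, but the
 * structures are heavily pared down and the helper is an assumption about
 * what xlog_cycle_data() does, not the real definition.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MODEL_BBSIZE		512	/* basic block size */
#define MODEL_CYCLE_SIZE	32768	/* iclog data covered per header */
#define CYCLES_PER_HEADER	(MODEL_CYCLE_SIZE / MODEL_BBSIZE)	/* 64 */

struct rec_ext_header {
	uint32_t xh_cycle;
	uint32_t xh_cycle_data[CYCLES_PER_HEADER];
};

struct rec_header {
	uint32_t h_cycle;
	uint32_t h_len;
	uint32_t h_cycle_data[CYCLES_PER_HEADER];
	struct rec_ext_header h_ext[];	/* extended headers for iclogs > 32k */
};

/*
 * Map basic block i of the iclog data area to the header slot that preserves
 * its first word while the block itself carries the cycle number: blocks
 * 0..63 use the base header, later blocks spill into h_ext[].
 */
static uint32_t *cycle_data(struct rec_header *rhead, unsigned int i)
{
	if (i < CYCLES_PER_HEADER)
		return &rhead->h_cycle_data[i];
	return &rhead->h_ext[i / CYCLES_PER_HEADER - 1]
			.xh_cycle_data[i % CYCLES_PER_HEADER];
}

int main(void)
{
	unsigned int iclog_size = 256 * 1024;	/* a 256k iclog buffer */
	unsigned int ext_headers = iclog_size / MODEL_CYCLE_SIZE - 1;	/* 7 */
	struct rec_header *rhead;

	rhead = calloc(1, sizeof(*rhead) +
			  ext_headers * sizeof(struct rec_ext_header));
	if (!rhead)
		return 1;

	/* Basic block 100 lands in extended header 0, slot 36. */
	*cycle_data(rhead, 100) = 0xdeadbeef;
	printf("h_ext[0].xh_cycle_data[36] = 0x%x\n",
	       rhead->h_ext[0].xh_cycle_data[36]);

	free(rhead);
	return 0;
}

xlog_pack_data() walks every 512-byte block of the iclog data area, parks the block's first word in the slot this helper returns and writes the cycle number in its place; xlog_cksum() then covers the base header and each extended header in turn before checksumming the payload.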
