diff options
author | Matthew Sakai <msakai@redhat.com> | 2023-11-16 20:57:50 -0500 |
---|---|---|
committer | Mike Snitzer <snitzer@kernel.org> | 2024-02-20 13:43:15 -0500 |
commit | 883069e30e0be28d792db04b97a1785848155d6b (patch) | |
tree | 204d94547da0f2e8959ac68f2baf7e941bbb08ed /drivers/md/dm-vdo/slab-depot.c | |
parent | 09eff388df02ac0fff7d726ca2136d623d1db907 (diff) |
dm vdo: add the slab summary
The slab depot maintains an additional small data structure, the "slab
summary," which is used to reduce the amount of work needed to come back
online after a crash. The slab summary maintains an entry for each slab
indicating whether or not the slab has ever been used, whether it is clean
(i.e. all of its reference count updates have been persisted to storage),
and approximately how full it is. During recovery, each physical zone will
attempt to recover at least one slab, stopping whenever it has recovered a
slab which has some free blocks. Once each zone has some space (or has
determined that none is available), the target can resume normal operation
in a degraded mode. Read and write requests can be serviced, perhaps with
degraded performance, while the remainder of the dirty slabs are recovered.
Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net>
Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net>
Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev>
Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev>
Co-developed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Diffstat (limited to 'drivers/md/dm-vdo/slab-depot.c')
-rw-r--r-- | drivers/md/dm-vdo/slab-depot.c | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index af3136f83e4f..89af2e8a2621 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -193,6 +193,199 @@ static void check_if_slab_drained(struct vdo_slab *slab) (read_only ? VDO_READ_ONLY : VDO_SUCCESS)); } +/* FULLNESS HINT COMPUTATION */ + +/** + * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be + * stored in a slab_summary_entry's 7 bits that are dedicated to its free + * count. + * @depot: The depot whose summary being updated. + * @free_blocks: The number of free blocks. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, even though + * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least + * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might + * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f + * is 0, which would make it impossible to distinguish completely full from completely empty. + * + * Return: A fullness hint, which can be stored in 7 bits. + */ +static u8 __must_check compute_fullness_hint(struct slab_depot *depot, + block_count_t free_blocks) +{ + block_count_t hint; + + ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23"); + + if (free_blocks == 0) + return 0; + + hint = free_blocks >> depot->hint_shift; + return ((hint == 0) ? 1 : hint); +} + +/** + * check_summary_drain_complete() - Check whether an allocators summary has finished draining. + */ +static void check_summary_drain_complete(struct block_allocator *allocator) +{ + struct vdo *vdo = allocator->depot->vdo; + + if (!vdo_is_state_draining(&allocator->summary_state) || + (allocator->summary_write_count > 0)) + return; + + vdo_finish_operation(&allocator->summary_state, + (vdo_is_read_only(vdo) ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * notify_summary_waiters() - Wake all the waiters in a given queue. + * @allocator: The block allocator summary which owns the queue. + * @queue: The queue to notify. + */ +static void notify_summary_waiters(struct block_allocator *allocator, + struct wait_queue *queue) +{ + int result = (vdo_is_read_only(allocator->depot->vdo) ? VDO_READ_ONLY : VDO_SUCCESS); + + vdo_notify_all_waiters(queue, NULL, &result); +} + +static void launch_write(struct slab_summary_block *summary_block); + +/** + * finish_updating_slab_summary_block() - Finish processing a block which attempted to write, + * whether or not the attempt succeeded. + * @block: The block. + */ +static void finish_updating_slab_summary_block(struct slab_summary_block *block) +{ + notify_summary_waiters(block->allocator, &block->current_update_waiters); + block->writing = false; + block->allocator->summary_write_count--; + if (vdo_has_waiters(&block->next_update_waiters)) + launch_write(block); + else + check_summary_drain_complete(block->allocator); +} + +/** + * finish_update() - This is the callback for a successful summary block write. + * @completion: The write vio. + */ +static void finish_update(struct vdo_completion *completion) +{ + struct slab_summary_block *block = + container_of(as_vio(completion), struct slab_summary_block, vio); + + atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written); + finish_updating_slab_summary_block(block); +} + +/** + * handle_write_error() - Handle an error writing a slab summary block. + * @completion: The write VIO. + */ +static void handle_write_error(struct vdo_completion *completion) +{ + struct slab_summary_block *block = + container_of(as_vio(completion), struct slab_summary_block, vio); + + vio_record_metadata_io_error(as_vio(completion)); + vdo_enter_read_only_mode(completion->vdo, completion->result); + finish_updating_slab_summary_block(block); +} + +static void write_slab_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_summary_block *block = + container_of(vio, struct slab_summary_block, vio); + + continue_vio_after_io(vio, finish_update, block->allocator->thread_id); +} + +/** + * launch_write() - Write a slab summary block unless it is currently out for writing. + * @block: The block that needs to be committed. + */ +static void launch_write(struct slab_summary_block *block) +{ + struct block_allocator *allocator = block->allocator; + struct slab_depot *depot = allocator->depot; + physical_block_number_t pbn; + + if (block->writing) + return; + + allocator->summary_write_count++; + vdo_transfer_all_waiters(&block->next_update_waiters, + &block->current_update_waiters); + block->writing = true; + + if (vdo_is_read_only(depot->vdo)) { + finish_updating_slab_summary_block(block); + return; + } + + memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE); + + /* + * Flush before writing to ensure that the slab journal tail blocks and reference updates + * covered by this summary update are stable (VDO-2332). + */ + pbn = (depot->summary_origin + + (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) + + block->index); + submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio, + handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH); +} + +/** + * update_slab_summary_entry() - Update the entry for a slab. + * @slab: The slab whose entry is to be updated + * @waiter: The waiter that is updating the summary. + * @tail_block_offset: The offset of the slab journal's tail block. + * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. + * @is_clean: Whether the slab is clean. + * @free_blocks: The number of free blocks. + */ +static void update_slab_summary_entry(struct vdo_slab *slab, struct waiter *waiter, + tail_block_offset_t tail_block_offset, + bool load_ref_counts, bool is_clean, + block_count_t free_blocks) +{ + u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK; + struct block_allocator *allocator = slab->allocator; + struct slab_summary_block *block = &allocator->summary_blocks[index]; + int result; + struct slab_summary_entry *entry; + + if (vdo_is_read_only(block->vio.completion.vdo)) { + result = VDO_READ_ONLY; + waiter->callback(waiter, &result); + return; + } + + if (vdo_is_state_draining(&allocator->summary_state) || + vdo_is_state_quiescent(&allocator->summary_state)) { + result = VDO_INVALID_ADMIN_STATE; + waiter->callback(waiter, &result); + return; + } + + entry = &allocator->summary_entries[slab->slab_number]; + *entry = (struct slab_summary_entry) { + .tail_block_offset = tail_block_offset, + .load_ref_counts = (entry->load_ref_counts || load_ref_counts), + .is_dirty = !is_clean, + .fullness_hint = compute_fullness_hint(allocator->depot, free_blocks), + }; + vdo_enqueue_waiter(&block->next_update_waiters, waiter); + launch_write(block); +} + /** * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are * complete. @@ -2402,3 +2595,130 @@ static int __must_check make_slab(physical_block_number_t slab_origin, *slab_ptr = slab; return VDO_SUCCESS; } + +/** + * finish_combining_zones() - Clean up after saving out the combined slab summary. + * @completion: The vio which was used to write the summary data. + */ +static void finish_combining_zones(struct vdo_completion *completion) +{ + int result = completion->result; + struct vdo_completion *parent = completion->parent; + + free_vio(as_vio(uds_forget(completion))); + vdo_fail_completion(parent, result); +} + +static void handle_combining_error(struct vdo_completion *completion) +{ + vio_record_metadata_io_error(as_vio(completion)); + finish_combining_zones(completion); +} + +static void write_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + + continue_vio_after_io(vio, finish_combining_zones, + vdo->thread_config.admin_thread); +} + +/** + * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones, + * update every zone to the correct values for every slab. + * @depot: The depot whose summary entries should be combined. + */ +static void combine_summaries(struct slab_depot *depot) +{ + /* + * Combine all the old summary data into the portion of the buffer corresponding to the + * first zone. + */ + zone_count_t zone = 0; + struct slab_summary_entry *entries = depot->summary_entries; + + if (depot->old_zone_count > 1) { + slab_count_t entry_number; + + for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) { + if (zone != 0) { + memcpy(entries + entry_number, + entries + (zone * MAX_VDO_SLABS) + entry_number, + sizeof(struct slab_summary_entry)); + } + + zone++; + if (zone == depot->old_zone_count) + zone = 0; + } + } + + /* Copy the combined data to each zones's region of the buffer. */ + for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) { + memcpy(entries + (zone * MAX_VDO_SLABS), entries, + MAX_VDO_SLABS * sizeof(struct slab_summary_entry)); + } +} + +/** + * finish_loading_summary() - Finish loading slab summary data. + * @completion: The vio which was used to read the summary data. + * + * Combines the slab summary data from all the previously written zones and copies the combined + * summary to each partition's data region. Then writes the combined summary back out to disk. This + * callback is registered in load_summary_endio(). + */ +static void finish_loading_summary(struct vdo_completion *completion) +{ + struct slab_depot *depot = completion->vdo->depot; + + /* Combine the summary from each zone so each zone is correct for all slabs. */ + combine_summaries(depot); + + /* Write the combined summary back out. */ + submit_metadata_vio(as_vio(completion), depot->summary_origin, + write_summary_endio, handle_combining_error, + REQ_OP_WRITE); +} + +static void load_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + + continue_vio_after_io(vio, finish_loading_summary, + vdo->thread_config.admin_thread); +} + +/** + * load_slab_summary() - The preamble of a load operation. + * + * Implements vdo_action_preamble_fn. + */ +static void load_slab_summary(void *context, struct vdo_completion *parent) +{ + int result; + struct vio *vio; + struct slab_depot *depot = context; + const struct admin_state_code *operation = + vdo_get_current_manager_operation(depot->action_manager); + + result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, parent, + VDO_SLAB_SUMMARY_BLOCKS, + (char *) depot->summary_entries, &vio); + if (result != VDO_SUCCESS) { + vdo_fail_completion(parent, result); + return; + } + + if ((operation == VDO_ADMIN_STATE_FORMATTING) || + (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) { + finish_loading_summary(&vio->completion); + return; + } + + submit_metadata_vio(vio, depot->summary_origin, load_summary_endio, + handle_combining_error, REQ_OP_READ); +} |