Diffstat (limited to 'fs/btrfs/free-space-cache.c')
| -rw-r--r-- | fs/btrfs/free-space-cache.c | 1479 |
1 file changed, 832 insertions, 647 deletions
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 55955bd424d7..f0f72850fab2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,56 +11,90 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> -#include "ctree.h" +#include <linux/string_choices.h> +#include "extent-tree.h" +#include "fs.h" +#include "messages.h" +#include "misc.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "inode-map.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "subpage.h" +#include "inode-item.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; struct list_head list; }; -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path); + struct btrfs_free_space *info, bool update_stat); +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes, bool for_alloc); +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes, bool update_stats); + +static void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + +static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -82,17 +116,17 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. 
*/ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); + inode = btrfs_iget_path(location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -104,7 +138,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, spin_lock(&block_group->lock); if (block_group->inode) - inode = igrab(block_group->inode); + inode = igrab(&block_group->inode->vfs_inode); spin_unlock(&block_group->lock); if (inode) return inode; @@ -122,10 +156,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!block_group->iref) { - block_group->inode = igrab(inode); - block_group->iref = 1; - } + if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) + block_group->inode = BTRFS_I(igrab(inode)); spin_unlock(&block_group->lock); return inode; @@ -141,17 +173,15 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; + /* We inline CRCs for the free disk space cache */ + const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | + BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; - /* We inline crc's for the free disk space cache */ - if (ino != BTRFS_FREE_INO_OBJECTID) - flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; - leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -168,12 +198,11 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -186,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); return 0; @@ -199,7 +227,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, int ret; u64 ino; - ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; @@ -207,36 +235,82 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, ino, block_group->start); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) +/* + * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode + * handles lookup, otherwise it takes ownership and iputs the inode. 
+ * Don't reuse an inode pointer after passing it into this function. + */ +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group) { - u64 needed_bytes; - int ret; + BTRFS_PATH_AUTO_FREE(path); + struct btrfs_key key; + int ret = 0; - /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + - btrfs_calc_metadata_size(fs_info, 1); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - spin_lock(&rsv->lock); - if (rsv->reserved < needed_bytes) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&rsv->lock); - return ret; + if (!inode) + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) != -ENOENT) + ret = PTR_ERR(inode); + return ret; + } + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + return ret; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for the lookup ref */ + btrfs_add_delayed_iput(BTRFS_I(inode)); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.type = 0; + key.offset = block_group->start; + ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, + -1, 1); + if (ret) { + if (ret > 0) + ret = 0; + return ret; + } + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct inode *inode) + struct inode *vfs_inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_truncate_control control = { + .inode = BTRFS_I(vfs_inode), + .new_size = 0, + .ino = btrfs_ino(BTRFS_I(vfs_inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; if (block_group) { - struct btrfs_path *path = btrfs_alloc_path(); + BTRFS_PATH_AUTO_FREE(path); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; @@ -257,22 +331,28 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); - btrfs_free_path(path); } - btrfs_i_size_write(BTRFS_I(inode), 0); - truncate_pagecache(inode, 0); + btrfs_i_size_write(inode, 0); + truncate_pagecache(vfs_inode, 0); + + btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. 
*/ - ret = btrfs_truncate_inode_items(trans, root, inode, - 0, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); + + btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); fail: if (locked) @@ -285,35 +365,24 @@ fail: static void readahead_cache(struct inode *inode) { - struct file_ra_state *ra; - unsigned long last_index; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return; + struct file_ra_state ra; + pgoff_t last_index; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); - - kfree(ra); + page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int write) { int num_pages; - int check_crcs = 0; num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) - check_crcs = 1; - /* Make sure we can fit our crcs and generation into the first page */ - if (write && check_crcs && - (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) + if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -323,8 +392,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); - io_ctl->check_crcs = check_crcs; + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; @@ -364,7 +432,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); + btrfs_folio_clear_checked(io_ctl->fs_info, + page_folio(io_ctl->pages[i]), + page_offset(io_ctl->pages[i]), + PAGE_SIZE); unlock_page(io_ctl->pages[i]); put_page(io_ctl->pages[i]); } @@ -373,28 +444,41 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; for (i = 0; i < io_ctl->num_pages; i++) { - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + int ret; + + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); - return -ENOMEM; + return PTR_ERR(folio); + } + + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + io_ctl_drop_pages(io_ctl); + return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (page->mapping != inode->i_mapping) { + + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { 
btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -403,59 +487,43 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) } } - for (i = 0; i < io_ctl->num_pages; i++) { + for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); - set_page_extent_mapped(io_ctl->pages[i]); - } return 0; } static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* * Skip the csum areas. If we don't check crcs then we just have a * 64bit chunk at the front of the first page. */ - if (io_ctl->check_crcs) { - io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); - io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit * chunk at the front of the first page. */ - if (io_ctl->check_crcs) { - io_ctl->cur += sizeof(u32) * io_ctl->num_pages; - io_ctl->size -= sizeof(u64) + - (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -469,15 +537,10 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_unmap_page(io_ctl); - return; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -491,11 +554,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_map_page(io_ctl, 0); - return 0; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; @@ -504,7 +562,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, @@ -525,8 +583,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +657,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -628,42 +686,48 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, return 0; } -/* - * Since we attach pinned extents after the fact we can have contiguous sections - * of free space that are split up in entries. This poses a problem with the - * tree logging stuff since it could have allocated across what appears to be 2 - * entries since we would have merged the entries when adding the pinned extents - * back to the free space cache. So run through the space cache that we just - * loaded and merge contiguous entries. This will make the log replay stuff not - * blow up and it will make for nicer allocator behavior. - */ -static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_free_space *e, *prev = NULL; - struct rb_node *n; + struct btrfs_block_group *block_group = ctl->block_group; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->length; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); -again: - spin_lock(&ctl->tree_lock); - for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { - e = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev) - goto next; - if (e->bitmap || prev->bitmap) - goto next; - if (prev->offset + prev->bytes == e->offset) { - unlink_free_space(ctl, prev); - unlink_free_space(ctl, e); - prev->bytes += e->bytes; - kmem_cache_free(btrfs_free_space_cachep, e); - link_free_space(ctl, prev); - prev = NULL; - spin_unlock(&ctl->tree_lock); - goto again; - } -next: - prev = e; - } - spin_unlock(&ctl->tree_lock); + max_bitmaps = max_t(u64, max_bitmaps, 1); + + if (ctl->total_bitmaps > max_bitmaps) + btrfs_err(block_group->fs_info, +"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", + block_group->start, block_group->length, + ctl->total_bitmaps, ctl->unit, max_bitmaps, + bytes_per_bg); + ASSERT(ctl->total_bitmaps <= max_bitmaps); + + /* + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. + */ + if (size < SZ_1G) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); + + bitmap_bytes = ctl->total_bitmaps * ctl->unit; + + /* + * we want the extent entry threshold to always be at most 1/2 the max + * bytes we can have, or whatever is less than that. 
+ */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + + ctl->extents_thresh = + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, @@ -688,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -747,8 +811,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -756,17 +822,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } - /* - * Sync discard ensures that the free space cache is always - * trimmed. So when reading this in, the state should reflect - * that. We also do this for async as a stop gap for lack of - * persistence. - */ - if (btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC)) - e->trim_state = BTRFS_TRIM_STATE_TRIMMED; - if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -787,21 +844,24 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); if (ret) { + spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + ctl->total_bitmaps++; + recalculate_thresholds(ctl); + spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } @@ -819,31 +879,68 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; - e->bitmap_extents = count_bitmap_extents(ctl, e); - if (!btrfs_free_space_trimmed(e)) { - ctl->discardable_extents[BTRFS_STAT_CURR] += - e->bitmap_extents; - ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; - } } io_ctl_drop_pages(&io_ctl); - merge_space_tree(ctl); ret = 1; out: - btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: io_ctl_drop_pages(&io_ctl); + + spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); + spin_unlock(&ctl->tree_lock); goto out; } +static int copy_free_space_cache(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int ret = 0; + + while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + const u64 offset = info->offset; + const u64 bytes = info->bytes; + + unlink_free_space(ctl, info, true); + spin_unlock(&ctl->tree_lock); + kmem_cache_free(btrfs_free_space_cachep, info); + ret = btrfs_add_free_space(block_group, offset, bytes); + spin_lock(&ctl->tree_lock); + } else { + u64 offset = info->offset; + 
u64 bytes = ctl->unit; + + ret = search_bitmap(ctl, info, &offset, &bytes, false); + if (ret == 0) { + bitmap_clear_bits(ctl, info, offset, bytes, true); + spin_unlock(&ctl->tree_lock); + ret = btrfs_add_free_space(block_group, offset, + bytes); + spin_lock(&ctl->tree_lock); + } else { + free_bitmap(ctl, info); + ret = 0; + } + } + cond_resched_lock(&ctl->tree_lock); + } + return ret; +} + +static struct lock_class_key btrfs_free_space_inode_key; + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space_ctl tmp_ctl = {}; struct inode *inode; struct btrfs_path *path; int ret = 0; @@ -851,6 +948,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group) u64 used = block_group->used; /* + * Because we could potentially discard our loaded free space, we want + * to load everything into a temporary structure first, and then if it's + * valid copy it all into the actual free space ctl. + */ + btrfs_init_free_space_ctl(block_group, &tmp_ctl); + + /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. */ @@ -864,8 +968,8 @@ int load_free_space_cache(struct btrfs_block_group *block_group) path = btrfs_alloc_path(); if (!path) return 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; /* * We must pass a path with search_commit_root set to btrfs_iget in @@ -901,19 +1005,41 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); - ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + /* + * Reinitialize the class of struct inode's mapping->invalidate_lock for + * free space inodes to prevent false positives related to locks for normal + * inodes. + */ + lockdep_set_class(&(&inode->i_data)->invalidate_lock, + &btrfs_free_space_inode_key); + + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; - spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->length - used - - block_group->bytes_super)); - spin_unlock(&ctl->tree_lock); + matched = (tmp_ctl.free_space == (block_group->length - used - + block_group->bytes_super)); - if (!matched) { - __btrfs_remove_free_space_cache(ctl); + if (matched) { + spin_lock(&tmp_ctl.tree_lock); + ret = copy_free_space_cache(block_group, &tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); + /* + * ret == 1 means we successfully loaded the free space cache, + * so we need to re-set it here. + */ + if (ret == 0) + ret = 1; + } else { + /* + * We need to call the _locked variant so we don't try to update + * the discard counters. 
+ */ + spin_lock(&tmp_ctl.tree_lock); + __btrfs_remove_free_space_cache(&tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -932,6 +1058,9 @@ out: block_group->start); } + spin_lock(&ctl->tree_lock); + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); iput(inode); return ret; } @@ -951,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { @@ -1026,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1043,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, 0, - 0, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1057,7 +1185,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); return 0; @@ -1091,10 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL); - if (ret) + if (!btrfs_find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -1138,10 +1264,10 @@ static int flush_dirty_cache(struct inode *inode) { int ret; - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); return ret; } @@ -1161,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1186,7 +1312,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: - io_ctl_free(io_ctl); if (ret) { 
invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; @@ -1195,7 +1320,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1224,14 +1349,6 @@ out: } -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path) -{ - return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0); -} - int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) @@ -1241,18 +1358,20 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle +/* + * Write out cached info to an inode. + * + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds context for the io + * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, @@ -1264,6 +1383,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1293,8 +1413,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out_unlock; - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1334,10 +1454,16 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
*/ - ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0, - i_size_read(inode), &cached_state); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1346,19 +1472,20 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * them out later */ io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; io_ctl->bitmaps = bitmaps; - ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); + ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); if (ret) goto out; @@ -1383,7 +1510,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1409,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, - block_group, &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, + &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", @@ -1459,20 +1586,34 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, return bitmap_start; } -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node, int bitmap) +static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *new_entry) { - struct rb_node **p = &root->rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; - struct btrfs_free_space *info; + + lockdep_assert_held(&ctl->tree_lock); + + if (cluster) { + lockdep_assert_held(&cluster->lock); + root = &cluster->root; + } else { + root = &ctl->free_space_offset; + } + + p = &root->rb_node; while (*p) { + struct btrfs_free_space *info; + parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (offset < info->offset) { + if (new_entry->offset < info->offset) { p = &(*p)->rb_left; - } else if (offset > info->offset) { + } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* @@ -1488,7 +1629,7 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * found a bitmap, we want to go left, or before * logically. 
*/ - if (bitmap) { + if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; @@ -1504,13 +1645,57 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } } - rb_link_node(node, parent, p); - rb_insert_color(node, root); + rb_link_node(&new_entry->offset_index, parent, p); + rb_insert_color(&new_entry->offset_index, root); return 0; } /* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4K, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8K we'd come in searching for 8K, and find the + * 8K contiguous range. + * + * Consider the other case, we have 2 8K chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16K, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8K as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + +/* * searches the tree for the given offset. 
* * fuzzy - If this is set, then we are trying to make an allocation, and we just @@ -1522,15 +1707,12 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; - struct btrfs_free_space *entry, *prev = NULL; + struct btrfs_free_space *entry = NULL, *prev = NULL; - /* find entry that is closest to the 'offset' */ - while (1) { - if (!n) { - entry = NULL; - break; - } + lockdep_assert_held(&ctl->tree_lock); + /* find entry that is closest to the 'offset' */ + while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; @@ -1540,6 +1722,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, n = n->rb_right; else break; + + entry = NULL; } if (bitmap_only) { @@ -1616,6 +1800,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, return NULL; while (1) { + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) @@ -1624,33 +1812,27 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (entry->offset + entry->bytes > offset) break; } - - n = rb_next(&entry->offset_index); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); } return entry; } -static inline void -__unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) +static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) { + lockdep_assert_held(&ctl->tree_lock); + rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } -} -static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - __unlink_free_space(ctl, info); - ctl->free_space -= info->bytes; + if (update_stat) + ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, @@ -1658,12 +1840,15 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; + lockdep_assert_held(&ctl->tree_lock); + ASSERT(info->bytes || info->bitmap); - ret = tree_insert_offset(&ctl->free_space_offset, info->offset, - &info->offset_index, (info->bitmap != NULL)); + ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1674,47 +1859,27 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) { - struct btrfs_block_group *block_group = ctl->private; - u64 max_bytes; - u64 bitmap_bytes; - u64 extent_bytes; - u64 size = block_group->length; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - - max_bitmaps = max_t(u64, max_bitmaps, 1); - - ASSERT(ctl->total_bitmaps <= max_bitmaps); + ASSERT(info->bitmap); /* - * We are trying to keep the total amount of memory used per 1GiB of - * space 
to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation - * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of - * bitmaps, we may end up using more memory than this. + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. */ - if (size < SZ_1G) - max_bytes = MAX_CACHE_BYTES_PER_GIG; - else - max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - - bitmap_bytes = ctl->total_bitmaps * ctl->unit; + if (RB_EMPTY_NODE(&info->bytes_index)) + return; - /* - * we want the extent entry threshold to always be at most 1/2 the max - * bytes we can have, or whatever is less than that. - */ - extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + lockdep_assert_held(&ctl->tree_lock); - ctl->extents_thresh = - div_u64(extent_bytes, sizeof(struct btrfs_free_space)); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } -static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, - u64 offset, u64 bytes) +static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; @@ -1730,6 +1895,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1741,19 +1908,14 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } -} -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - __bitmap_clear_bits(ctl, info, offset, bytes); - ctl->free_space -= bytes; + if (update_stat) + ctl->free_space -= bytes; } -static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) +static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) { unsigned long start, count, end; int extent_delta = 1; @@ -1765,9 +1927,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. 
+ */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1835,20 +2004,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1858,16 +2021,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. + */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1884,6 +2069,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1891,6 +2083,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1903,6 +2096,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. 
+ */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -1914,29 +2116,6 @@ out: return NULL; } -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info) -{ - struct btrfs_block_group *block_group = ctl->private; - u64 bytes = bitmap_info->bytes; - unsigned int rs, re; - int count = 0; - - if (!block_group || !bytes) - return count; - - bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, - BITS_PER_BITMAP) { - bytes -= (rs - re) * ctl->unit; - count++; - - if (!bytes) - break; - } - - return count; -} - static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { @@ -1946,8 +2125,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; - - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static void free_bitmap(struct btrfs_free_space_ctl *ctl, @@ -1965,11 +2143,11 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } - unlink_free_space(ctl, bitmap_info); + unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, @@ -2003,7 +2181,7 @@ again: /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); - bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; @@ -2073,13 +2251,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bytes_to_set = min(end - offset, bytes); - bitmap_set_bits(ctl, info, offset, bytes_to_set); - - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; + btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); return bytes_to_set; @@ -2088,7 +2260,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2110,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty - * of cache left then go ahead an dadd them, no sense in adding + * of cache left then go ahead and add them, no sense in adding * the overhead of a bitmap if we don't have to. 
*/ if (info->bytes <= fs_info->sectorsize * 8) { @@ -2136,7 +2308,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } static const struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2158,7 +2329,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, return 0; if (ctl->op == &free_space_op) - block_group = ctl->private; + block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we @@ -2170,9 +2341,8 @@ again: struct rb_node *node; struct btrfs_free_space *entry; - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { @@ -2281,12 +2451,13 @@ out: static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *left_info; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); + struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we @@ -2294,19 +2465,18 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); - if (right_info && rb_prev(&right_info->offset_index)) - left_info = rb_entry(rb_prev(&right_info->offset_index), - struct btrfs_free_space, offset_index); - else + if (right_info) + right_prev = rb_prev(&right_info->offset_index); + + if (right_prev) + left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); + else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); /* See try_merge_free_space() comment. 
*/ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { - if (update_stat) - unlink_free_space(ctl, right_info); - else - __unlink_free_space(ctl, right_info); + unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; @@ -2316,10 +2486,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { - if (update_stat) - unlink_free_space(ctl, left_info); - else - __unlink_free_space(ctl, left_info); + unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); @@ -2355,10 +2522,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, end, bytes); - else - __bitmap_clear_bits(ctl, bitmap, end, bytes); + bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2412,10 +2576,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, info->offset, bytes); - else - __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2459,16 +2620,18 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; + ASSERT(!btrfs_is_zoned(fs_info)); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -2477,6 +2640,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2510,7 +2674,7 @@ link: if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2526,17 +2690,90 @@ out: return ret; } +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_space_info *sinfo = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + int bg_reclaim_threshold = 0; + bool initial; + u64 reclaimable_unusable; + + spin_lock(&block_group->lock); + + initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = 
READ_ONCE(sinfo->bg_reclaim_threshold); + + if (!used) + to_free = size; + else if (initial) + to_free = block_group->zone_capacity; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + spin_lock(&ctl->tree_lock); + ctl->free_space += to_free; + spin_unlock(&ctl->tree_lock); + /* + * If the block group is read-only, we should account freed space into + * bytes_readonly. + */ + if (!block_group->ro) { + block_group->zone_unusable += to_unusable; + WARN_ON(block_group->zone_unusable > block_group->length); + } + if (!used) { + block_group->alloc_offset -= size; + } + + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) { + btrfs_mark_bg_unused(block_group); + } else if (bg_reclaim_threshold && + reclaimable_unusable >= + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { + btrfs_mark_bg_to_reclaim(block_group); + } + + spin_unlock(&block_group->lock); + + return 0; +} + int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); +} + +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); } /* @@ -2549,13 +2786,15 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2566,6 +2805,26 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. 
+ */ + if (block_group->start + block_group->alloc_offset < + offset + bytes) { + block_group->alloc_offset = + offset + bytes - block_group->start; + } + return 0; + } + spin_lock(&ctl->tree_lock); again: @@ -2594,7 +2853,7 @@ again: re_search = false; if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); if (offset == info->offset) { u64 to_free = min(bytes, info->bytes); @@ -2630,7 +2889,7 @@ again: } spin_unlock(&ctl->tree_lock); - ret = __btrfs_add_free_space(block_group->fs_info, ctl, + ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); @@ -2645,7 +2904,7 @@ again: goto again; } out_lock: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); out: return ret; @@ -2660,32 +2919,45 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, struct rb_node *n; int count = 0; + /* + * Zoned btrfs does not use free space tree and cluster. Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu active %d", + block_group->zone_capacity - block_group->alloc_offset, + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)); + return; + } + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, - "%d blocks of free space at or bigger than bytes is", count); + "%d free space entries at or bigger than %llu bytes", + count, bytes); } -void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; - ctl->private = block_group; + ctl->block_group = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2703,18 +2975,20 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) * pointed to by the cluster, someone else raced in and freed the * cluster already. 
In that case, we just return without changing anything */ -static int -__btrfs_return_cluster_to_free_space( +static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; struct rb_node *node; + lockdep_assert_held(&ctl->tree_lock); + spin_lock(&cluster->lock); - if (cluster->block_group != block_group) - goto out; + if (cluster->block_group != block_group) { + spin_unlock(&cluster->lock); + return; + } cluster->block_group = NULL; cluster->window_start = 0; @@ -2722,15 +2996,14 @@ __btrfs_return_cluster_to_free_space( node = rb_first(&cluster->root); while (node) { - bool bitmap; + struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); - bitmap = (entry->bitmap != NULL); - if (!bitmap) { + if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; @@ -2748,43 +3021,13 @@ __btrfs_return_cluster_to_free_space( entry->bytes; } } - tree_insert_offset(&ctl->free_space_offset, - entry->offset, &entry->offset_index, bitmap); + tree_insert_offset(ctl, NULL, entry); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); } cluster->root = RB_ROOT; - -out: spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); - return 0; -} - -static void __btrfs_remove_free_space_cache_locked( - struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *node; - - while ((node = rb_last(&ctl->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - if (!info->bitmap) { - unlink_free_space(ctl, info); - kmem_cache_free(btrfs_free_space_cachep, info); - } else { - free_bitmap(ctl, info); - } - - cond_resched_lock(&ctl->tree_lock); - } -} - -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) -{ - spin_lock(&ctl->tree_lock); - __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->private) - btrfs_discard_update_discardable(ctl->private, ctl); - spin_unlock(&ctl->tree_lock); } void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) @@ -2804,16 +3047,13 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } - __btrfs_remove_free_space_cache_locked(ctl); - btrfs_discard_update_discardable(block_group, ctl); + __btrfs_remove_free_space_cache(ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
*/ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) @@ -2854,16 +3094,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { - bitmap_clear_bits(ctl, entry, offset, bytes); + bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); @@ -2871,7 +3115,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; @@ -2889,12 +3133,11 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, link_free_space(ctl, entry); } out: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (align_gap_len) - __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len, + __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } @@ -2907,12 +3150,11 @@ out: * Otherwise, it'll get a reference on the block group pointed to by the * cluster and remove the cluster from it. */ -int btrfs_return_cluster_to_free_space( +void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; - int ret; /* first, get a safe pointer to the block group */ spin_lock(&cluster->lock); @@ -2920,28 +3162,27 @@ int btrfs_return_cluster_to_free_space( block_group = cluster->block_group; if (!block_group) { spin_unlock(&cluster->lock); - return 0; + return; } } else if (cluster->block_group != block_group) { /* someone else has already freed it don't redo their work */ spin_unlock(&cluster->lock); - return 0; + return; } - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; /* now return any extents the cluster had on it */ spin_lock(&ctl->tree_lock); - ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); /* finally drop our ref */ btrfs_put_block_group(block_group); - return ret; } static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, @@ -2951,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - int err; + int ret2; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; @@ -2959,15 +3200,15 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); - if (err) { + ret2 = search_bitmap(ctl, 
entry, &search_start, &search_bytes, true); + if (ret2) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; } ret = search_start; - __bitmap_clear_bits(ctl, entry, ret, bytes); + bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } @@ -2988,6 +3229,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct rb_node *node; u64 ret = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -3036,8 +3279,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, entry->bytes -= bytes; } - if (entry->bytes == 0) - rb_erase(&entry->offset_index, &cluster->root); break; } out: @@ -3054,19 +3295,23 @@ out: ctl->free_space -= bytes; if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + + spin_lock(&cluster->lock); if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); ctl->free_extents--; if (entry->bitmap) { kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } + spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; @@ -3089,6 +3334,8 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, unsigned long total_found = 0; int ret; + lockdep_assert_held(&ctl->tree_lock); + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); @@ -3139,8 +3386,18 @@ again: cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 1); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + + ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, @@ -3168,6 +3425,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, u64 max_extent; u64 total_size = 0; + lockdep_assert_held(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; @@ -3229,8 +3488,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); @@ -3314,7 +3573,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. 
*/ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; @@ -3358,7 +3618,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, list_del_init(&entry->list); if (!ret) { - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -3396,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; - int update = 0; + bool bg_ro; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; @@ -3404,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (!block_group->ro) { + bg_ro = block_group->ro; + if (!bg_ro) { block_group->reserved += reserved_bytes; + spin_unlock(&block_group->lock); space_info->bytes_reserved += reserved_bytes; - update = 1; + } else { + spin_unlock(&block_group->lock); } - spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); @@ -3420,24 +3682,26 @@ static int do_trimming(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) - __btrfs_add_free_space(fs_info, ctl, reserved_start, + __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); - if (start + bytes < reserved_start + reserved_bytes) - __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + if (end < reserved_end) + __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); - __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); + __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); - if (update) { + if (!bg_ro) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; + bg_ro = block_group->ro; block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); + + space_info->bytes_reserved -= reserved_bytes; + if (bg_ro) + space_info->bytes_readonly += reserved_bytes; spin_unlock(&space_info->lock); } @@ -3500,7 +3764,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim @@ -3526,7 +3790,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3549,7 +3813,7 @@ next: if (async && *total_trimmed) break; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -3569,7 +3833,7 @@ out_unlock: /* * If we break out of trimming a bitmap prematurely, we should reset the - * trimming bit. 
In a rather contrieved case, it's possible to race here so + * trimming bit. In a rather contrived case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. * * start = start of bitmap @@ -3713,7 +3977,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, bytes > (max_discard_size + minlen)) bytes = max_discard_size; - bitmap_clear_bits(ctl, entry, start, bytes); + bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); @@ -3740,7 +4004,7 @@ next: } block_group->discard_cursor = start; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { if (start != offset) reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; @@ -3764,10 +4028,12 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, int ret; u64 rem = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -3797,7 +4063,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -3819,7 +4085,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -3834,168 +4100,87 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, return ret; } -/* - * Find the left-most item in the cache tree, and then return the - * smallest inode number in the item. - * - * Note: the returned inode number may not be the smallest one in - * the tree, if the left-most item is a bitmap. 
- */ -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) { - struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; - struct btrfs_free_space *entry = NULL; - u64 ino = 0; - - spin_lock(&ctl->tree_lock); - - if (RB_EMPTY_ROOT(&ctl->free_space_offset)) - goto out; - - entry = rb_entry(rb_first(&ctl->free_space_offset), - struct btrfs_free_space, offset_index); - - if (!entry->bitmap) { - ino = entry->offset; - - unlink_free_space(ctl, entry); - entry->offset++; - entry->bytes--; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } else { - u64 offset = 0; - u64 count = 1; - int ret; - - ret = search_bitmap(ctl, entry, &offset, &count, true); - /* Logic error; Should be empty if it can't find anything */ - ASSERT(!ret); - - ino = offset; - bitmap_clear_bits(ctl, entry, offset, 1); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } -out: - spin_unlock(&ctl->tree_lock); - - return ino; + return btrfs_super_cache_generation(fs_info->super_copy); } -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path) +static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) { - struct inode *inode = NULL; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_inode) - inode = igrab(root->ino_cache_inode); - spin_unlock(&root->ino_cache_lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, 0); - if (IS_ERR(inode)) - return inode; + struct btrfs_block_group *block_group; + struct rb_node *node; + int ret = 0; - spin_lock(&root->ino_cache_lock); - if (!btrfs_fs_closing(root->fs_info)) - root->ino_cache_inode = igrab(inode); - spin_unlock(&root->ino_cache_lock); + btrfs_info(fs_info, "cleaning free space cache v1"); - return inode; -} - -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - return __create_free_space_inode(root, trans, path, - BTRFS_FREE_INO_OBJECTID, 0); + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group, cache_node); + ret = btrfs_remove_free_space_inode(trans, NULL, block_group); + if (ret) + goto out; + node = rb_next(node); + } +out: + return ret; } -int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - int ret = 0; - u64 root_gen = btrfs_root_generation(&root->root_item); - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; + struct btrfs_trans_handle *trans; + int ret; /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. + * update_super_roots will appropriately set or unset + * super_copy->cache_generation based on SPACE_CACHE and + * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a + * transaction commit whether we are enabling space cache v1 and don't + * have any other work to do, or are disabling it and removing free + * space inodes. 
*/ - if (btrfs_fs_closing(fs_info)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - goto out; - - if (root_gen != BTRFS_I(inode)->generation) - goto out_put; - - ret = __load_free_space_cache(root, inode, ctl, path, 0); + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (!active) { + set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + ret = cleanup_free_space_cache_v1(fs_info, trans); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } - if (ret < 0) - btrfs_err(fs_info, - "failed to load free ino cache for root %llu", - root->root_key.objectid); -out_put: - iput(inode); + ret = btrfs_commit_transaction(trans); out: - btrfs_free_path(path); + clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + return ret; } -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int __init btrfs_free_space_init(void) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - int ret; - struct btrfs_io_ctl io_ctl; - bool release_metadata = true; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); + if (!btrfs_free_space_cachep) + return -ENOMEM; - memset(&io_ctl, 0, sizeof(io_ctl)); - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans); - if (!ret) { - /* - * At this point writepages() didn't error out, so our metadata - * reservation is released when the writeback finishes, at - * inode.c:btrfs_finish_ordered_io(), regardless of it finishing - * with or without an error. - */ - release_metadata = false; - ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path); + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + 0, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; } - if (ret) { - if (release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size, true); - btrfs_debug(fs_info, - "failed to write free ino cache for root %llu error %d", - root->root_key.objectid, ret); - } + return 0; +} - return ret; +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS |
