diff options
35 files changed, 590 insertions, 360 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index c3f7fbd0d67a..efb51ee92683 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11155,7 +11155,8 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song <muchun.song@linux.dev> -R: Oscar Salvador <osalvador@suse.de> +M: Oscar Salvador <osalvador@suse.de> +R: David Hildenbrand <david@redhat.com> L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -11166,6 +11167,7 @@ F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: include/trace/events/hugetlbfs.h F: mm/hugetlb.c +F: mm/hugetlb_cgroup.c F: mm/hugetlb_cma.c F: mm/hugetlb_cma.h F: mm/hugetlb_vmemmap.c @@ -13345,6 +13347,7 @@ M: Alexander Graf <graf@amazon.com> M: Mike Rapoport <rppt@kernel.org> M: Changyuan Lyu <changyuanl@google.com> L: kexec@lists.infradead.org +L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* @@ -15676,8 +15679,11 @@ S: Maintained F: Documentation/core-api/boot-time-mm.rst F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h +F: mm/bootmem_info.c F: mm/memblock.c +F: mm/memtest.c F: mm/mm_init.c +F: mm/rodata_test.c F: tools/testing/memblock/ MEMORY ALLOCATION PROFILING @@ -15732,7 +15738,6 @@ F: Documentation/admin-guide/mm/ F: Documentation/mm/ F: include/linux/gfp.h F: include/linux/gfp_types.h -F: include/linux/memfd.h F: include/linux/memory_hotplug.h F: include/linux/memory-tiers.h F: include/linux/mempolicy.h @@ -15792,6 +15797,10 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: mm/gup.c +F: mm/gup_test.c +F: mm/gup_test.h +F: tools/testing/selftests/mm/gup_longterm.c +F: tools/testing/selftests/mm/gup_test.c MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) M: Andrew Morton <akpm@linux-foundation.org> @@ -15868,6 +15877,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/pt_reclaim.c F: mm/vmscan.c +F: mm/workingset.c MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) M: Andrew Morton <akpm@linux-foundation.org> @@ -15880,6 +15890,7 @@ R: Harry Yoo <harry.yoo@oracle.com> L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: mm/page_vma_mapped.c F: mm/rmap.c MEMORY MANAGEMENT - SECRETMEM @@ -15972,11 +15983,14 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: mm/mincore.c F: mm/mlock.c F: mm/mmap.c F: mm/mprotect.c F: mm/mremap.c F: mm/mseal.c +F: mm/msync.c +F: mm/nommu.c F: mm/vma.c F: mm/vma.h F: mm/vma_exec.c @@ -25027,8 +25041,11 @@ M: Hugh Dickins <hughd@google.com> R: Baolin Wang <baolin.wang@linux.alibaba.com> L: linux-mm@kvack.org S: Maintained +F: include/linux/memfd.h F: include/linux/shmem_fs.h +F: mm/memfd.c F: mm/shmem.c +F: mm/shmem_quota.c TOMOYO SECURITY MODULE M: Kentaro Takeda <takedakn@nttdata.co.jp> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d4697e79d5a3..b2d10063d35f 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,7 +5,6 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES - select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8998e61efa40..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -164,61 +164,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) * prio is worth 1/8th of what INITIAL_PRIO is worth. */ -static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) -{ - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; - - return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); -} - -static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); -} - -static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; +#define bucket_prio(b) \ +({ \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) - return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); -} +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; - const struct min_heap_callbacks bucket_max_cmp_callback = { - .less = new_bucket_max_cmp, - .swp = NULL, - }; - const struct min_heap_callbacks bucket_min_cmp_callback = { - .less = new_bucket_min_cmp, - .swp = NULL, - }; + ssize_t i; - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (!bch_can_invalidate_bucket(ca, b)) continue; - if (!min_heap_full(&ca->heap)) - min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); - else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); + heap_sift(&ca->heap, 0, bucket_max_cmp); } } - min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); while (!fifo_full(&ca->free_inc)) { - if (!ca->heap.nr) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { /* * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything @@ -227,8 +206,6 @@ static void invalidate_buckets_lru(struct cache *ca) wake_up_gc(ca->set); return; } - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); bch_invalidate_one_bucket(ca, b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 785b0d9008fa..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -458,7 +458,7 @@ struct cache { /* Allocation stuff: */ struct bucket *buckets; - DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; + DECLARE_HEAP(struct bucket *, heap); /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 68258a16e125..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - if (b->ops->is_extents) for_each_key(b, k, &iter) ret += KEY_SIZE(k); @@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key(b, k, &iter) { if (b->ops->is_extents) { err = "Keys out of order"; @@ -114,9 +110,9 @@ bug: static void bch_btree_iter_next_check(struct btree_iter *iter) { - struct bkey *k = iter->heap.data->k, *next = bkey_next(k); + struct bkey *k = iter->data->k, *next = bkey_next(k); - if (next < iter->heap.data->end && + if (next < iter->data->end && bkey_cmp(k, iter->b->ops->is_extents ? &START_KEY(next) : next) > 0) { bch_dump_bucket(iter->b); @@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set @@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1083,94 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, /* Btree iterator */ -typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); -static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - const struct btree_iter_set *_l = l; - const struct btree_iter_set *_r = r; - - return bkey_cmp(_l->k, _r->k) <= 0; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) { - return !iter->heap.nr; + return !iter->used; } void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end) { - const struct min_heap_callbacks callbacks = { - .less = new_btree_iter_cmp, - .swp = NULL, - }; - if (k != end) - BUG_ON(!min_heap_push(&iter->heap, - &((struct btree_iter_set) { k, end }), - &callbacks, - NULL)); + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); - iter->heap.nr = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, - new_btree_iter_cmp_fn *cmp) + btree_iter_cmp_fn *cmp) { struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = cmp, - .swp = NULL, - }; if (!btree_iter_end(iter)) { bch_btree_iter_next_check(iter); - ret = iter->heap.data->k; - iter->heap.data->k = bkey_next(iter->heap.data->k); + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); - if (iter->heap.data->k > iter->heap.data->end) { + if (iter->data->k > iter->data->end) { WARN_ONCE(1, "bset was corrupt!\n"); - iter->heap.data->k = iter->heap.data->end; + iter->data->k = iter->data->end; } - if (iter->heap.data->k == iter->heap.data->end) { - if (iter->heap.nr) { - b = min_heap_peek(&iter->heap)[0]; - min_heap_pop(&iter->heap, &callbacks, NULL); - } - } + if (iter->data->k == iter->data->end) + heap_pop(iter, b, cmp); else - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, cmp); } return ret; @@ -1178,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *bch_btree_iter_next(struct btree_iter *iter) { - return __bch_btree_iter_next(iter, new_btree_iter_cmp); + return __bch_btree_iter_next(iter, btree_iter_cmp); } @@ -1216,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; - const struct min_heap_callbacks callbacks = { - .less = b->ops->sort_cmp, - .swp = NULL, - }; /* Heapify the iterator, using our comparison function */ - min_heapify_all(&iter->heap, &callbacks, NULL); + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); while (!btree_iter_end(iter)) { if (b->ops->sort_fixup && fixup) @@ -1316,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1331,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1347,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index f79441acd4c1..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -187,9 +187,8 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(const void *l, - const void *r, - void *args); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); struct bkey *(*sort_fixup)(struct btree_iter *iter, struct bkey *tmp); bool (*insert_fixup)(struct btree_keys *b, @@ -313,17 +312,23 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; -struct btree_iter_set { - struct bkey *k, *end; -}; - /* Btree key iteration */ struct btree_iter { + size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif - MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; + struct btree_iter_set { + struct bkey *k, *end; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -335,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -352,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1d0100677357..210b59007d98 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -148,19 +148,19 @@ void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); - struct btree_iter iter; + struct btree_iter *iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. */ - iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; - iter.heap.nr = 0; + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter.b = &b->keys; + iter->b = &b->keys; #endif if (!i->seq) @@ -198,7 +198,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -210,7 +210,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; @@ -222,7 +222,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: - mempool_free(iter.heap.data, &b->c->fill_iter); + mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); @@ -1306,11 +1306,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { @@ -1569,11 +1567,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1612,18 +1608,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1918,9 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1928,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1959,7 +1953,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1967,11 +1961,9 @@ static int bch_btree_check_thread(void *arg) cur_idx = prev_idx = 0; ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1989,7 +1981,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2062,11 +2054,9 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); @@ -2560,12 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2594,12 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 4b84fda1530a..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter, i->k = bkey_next(i->k); if (i->k == i->end) - *i = iter->heap.data[--iter->heap.nr]; + *i = iter->data[--iter->used]; } -static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(_l->k, _r->k); + int64_t c = bkey_cmp(l.k, r.k); - return !(c ? c > 0 : _l->k < _r->k); + return c ? c > 0 : l.k < r.k; } static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, } const struct btree_keys_ops bch_btree_keys_ops = { - .sort_cmp = new_bch_key_sort_cmp, + .sort_cmp = bch_key_sort_cmp, .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, @@ -256,28 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = { * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ - -static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - return !(c ? c > 0 : _l->k < _r->k); + return c ? c > 0 : l.k < r.k; } static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { - const struct min_heap_callbacks callbacks = { - .less = new_bch_extent_sort_cmp, - .swp = NULL, - }; - while (iter->heap.nr > 1) { - struct btree_iter_set *top = iter->heap.data, *i = top + 1; - - if (iter->heap.nr > 2 && - !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -285,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); continue; } @@ -295,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, else bch_cut_front(top->k, i->k); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -305,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, bch_cut_back(&START_KEY(i->k), tmp); bch_cut_front(i->k, top->k); - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, bch_extent_sort_cmp); return tmp; } else { @@ -625,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk, } const struct btree_keys_ops bch_extent_keys_ops = { - .sort_cmp = new_bch_extent_sort_cmp, + .sort_cmp = bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 45ca134cbf02..26a6a535ec32 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -182,19 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private)) closure_sync(&cl); } -static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) +static bool bucket_cmp(struct bucket *l, struct bucket *r) { - struct bucket **_l = (struct bucket **)l; - struct bucket **_r = (struct bucket **)r; - - return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; - return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -202,10 +199,6 @@ void bch_moving_gc(struct cache_set *c) struct cache *ca = c->cache; struct bucket *b; unsigned long sectors_to_move, reserve_sectors; - const struct min_heap_callbacks callbacks = { - .less = new_bucket_cmp, - .swp = NULL, - }; if (!c->copy_gc_enabled) return; @@ -216,7 +209,7 @@ void bch_moving_gc(struct cache_set *c) reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || @@ -225,31 +218,25 @@ void bch_moving_gc(struct cache_set *c) atomic_read(&b->pin)) continue; - if (!min_heap_full(&ca->heap)) { + if (!heap_full(&ca->heap)) { sectors_to_move += GC_SECTORS_USED(b); - min_heap_push(&ca->heap, &b, &callbacks, NULL); - } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { sectors_to_move -= bucket_heap_top(ca); sectors_to_move += GC_SECTORS_USED(b); ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); + heap_sift(&ca->heap, 0, bucket_cmp); } } while (sectors_to_move > reserve_sectors) { - if (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); - } + heap_pop(&ca->heap, b, bucket_cmp); sectors_to_move -= GC_SECTORS_USED(b); } - while (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); + while (heap_pop(&ca->heap, b, bucket_cmp)) SET_GC_MOVE(b, 1); - } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1efb768b2890..2ea490b9d370 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1912,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e8f696cb58c0..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,9 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 539454d8e2d0..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/sched/clock.h> #include <linux/llist.h> -#include <linux/min_heap.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -31,10 +30,16 @@ struct closure; #endif +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + #define init_heap(heap, _size, gfp) \ ({ \ size_t _bytes; \ - (heap)->nr = 0; \ + (heap)->used = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ @@ -47,6 +52,64 @@ do { \ (heap)->data = NULL; \ } while (0) +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 453efbbdc8ee..302e75f1fc4b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,16 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -931,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -980,13 +979,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - retry_lock: b = c->root; rw_lock(0, b, b->level); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7cc24a5dd5e..8c597fa60523 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1beb9458f622..0d6ad7512f21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -2156,8 +2158,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -4310,8 +4311,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4322,15 +4323,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); - /* There should be no more workload to generate new delayed iputs. */ - set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 849199768664..1dc931c4937f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_unlock(&eb->refs_lock); continue; } - xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); xa_lock_irq(&fs_info->buffer_tree); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0c573d46639a..a3e2a2a81461 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c778243bf1..26d6ed170a19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 913acef3f0a9..4eda35bdba71 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3139,7 +3139,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ce36fafc771e..7cd5e76a783c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -557,7 +557,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -571,7 +571,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; @@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; @@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); @@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -1046,12 +1046,12 @@ skip: if (repaired) { if (dev) { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -1060,12 +1060,12 @@ skip: /* The remaining are all for unrepaired. */ if (dev) { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, int ret; if (unlikely(!extent_root || !csum_root)) { - btrfs_err(fs_info, "no valid extent or csum root for scrub"); + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; @@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2892,13 +2891,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -3059,7 +3058,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 97e933113b82..858b609e292c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -668,15 +668,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + if (!inode) + return -EIO; /* * first check to see if we already have this extent in the @@ -961,7 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(&inode->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1176,8 +1177,8 @@ again: ret = unlink_inode_for_log_replay(trans, victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89835071cfea..f475b4b7c457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3282,6 +3282,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5b0156d5b95..9430b34d3cbb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..696131e655ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,6 +35,17 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> +static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_pagecache(inode, old_size); +} + static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); + filemap_invalidate_unlock(inode->i_mapping); + file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); + folio_lock(folio); if (unlikely(folio->mapping != inode->i_mapping || folio_pos(folio) > i_size_read(inode) || @@ -1109,6 +1125,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_down_write(&fi->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1227,6 +1245,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1510,6 +1532,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); @@ -1631,6 +1655,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + filemap_invalidate_lock(mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1762,6 +1790,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1819,6 +1849,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; @@ -4860,6 +4894,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) err = file_modified(file); if (err) return err; + + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); + filemap_invalidate_unlock(inode->i_mapping); return count; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1cb4cba7f961..bfe104db284e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2078,7 +2078,6 @@ write_node: if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { - folio_unlock(folio); folio_batch_release(&fbatch); ret = -EIO; goto out; diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index affe979bd14d..00524e55a21e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5527,8 +5527,9 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(&wr_mas, entry); if (!request) - return ret; + goto set_flag; + mas->mas_flags &= ~MA_STATE_PREALLOC; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); @@ -5538,6 +5539,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) return ret; } +set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return ret; } @@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_folios *pofs) /* * Returns the number of collected folios. Return value is always >= 0. */ -static void collect_longterm_unpinnable_folios( +static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { + unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; - unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); @@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_folios( if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2418,9 +2422,11 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); + unsigned long collected; - collect_longterm_unpinnable_folios(&movable_folio_list, pofs); - if (list_empty(&movable_folio_list)) + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); diff --git a/mm/memory.c b/mm/memory.c index 8eba595056fe..b0cda5aab398 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4315,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..3a5a65b1f41a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2259,6 +2259,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2272,9 +2273,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. */ diff --git a/mm/swap.h b/mm/swap.h index 2269eb9df0af..9096082a915e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ /** diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bc473ad21202..8253978ee0fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1412,7 +1441,7 @@ retry: } err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, - dst_ptl, src_ptl, src_folio); + dst_ptl, src_ptl, src_folio, si, entry); } out: diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index a28baa536332..deba93379c80 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -8,3 +8,6 @@ CONFIG_GUP_TEST=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ANON_VMA_NAME=y +CONFIG_FTRACE=y +CONFIG_PROFILING=y +CONFIG_UPROBES=y diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index bbae66fc5038..cc26480098ae 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -470,7 +470,9 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_GE(fd, 0); ASSERT_EQ(ftruncate(fd, page_size), 0); - ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0); + if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) { + SKIP(goto out, "Failed to read uprobe sysfs file, skipping"); + } memset(&attr, 0, attr_sz); attr.size = attr_sz; @@ -491,6 +493,7 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_NE(mremap(ptr2, page_size, page_size, MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED); +out: close(fd); remove(probe_file); } diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index a953c96aa16e..e2206265f67c 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=180 +timeout=900 |