From df98b6e2c52f65665eaf0fc23e647fb64335b289 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 20 Jun 2011 14:53:48 -0400 Subject: Btrfs: fix how we merge extent states and deal with cached states First, we can sometimes free the state we're merging, which means anybody who calls merge_state() may have the state it passed in free'ed. This is problematic because we could end up caching the state, which makes caching useless as the state will no longer be part of the tree. So instead of free'ing the state we passed into merge_state(), set it's end to the other->end and free the other state. This way we are sure to cache the correct state. Also because we can merge states together, instead of only using the cache'd state if it's start == the start we are looking for, go ahead and use it if the start we are looking for is within the range of the cached state. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b181a94a7170..abb922daf1b6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -279,11 +279,10 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } @@ -349,7 +348,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -498,7 +496,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -740,7 +739,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -781,13 +781,13 @@ hit_next: if (err) goto out; - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -860,7 +860,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -870,7 +869,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1562,7 +1560,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); -- cgit From f7aaa06bff6f5fe049ce9723267e1639c2c3d8b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jul 2011 21:26:38 +0000 Subject: Btrfs: tag pages for writeback in sync Everybody else does this, we need to do it too. If we're syncing, we need to tag the pages we're going to write for writeback so we don't end up writing the same stuff over and over again if somebody is constantly redirtying our file. This will keep us from having latencies with heavy sync workloads. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index abb922daf1b6..89bbde46bd83 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2421,6 +2421,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2431,11 +2432,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, end = wbc->range_end >> PAGE_CACHE_SHIFT; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; -- cgit From a65917156e345946dbde3d7effd28124c6d6a8c2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 19 Jul 2011 12:04:14 -0400 Subject: Btrfs: stop using highmem for extent_buffers The extent_buffers have a very complex interface where we use HIGHMEM for metadata and try to cache a kmap mapping to access the memory. The next commit adds reader/writer locks, and concurrent use of this kmap cache would make it even more complex. This commit drops the ability to use HIGHMEM with extent buffers, and rips out all of the related code. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 67 ++++++++++------------------------------------------ 1 file changed, 13 insertions(+), 54 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89bbde46bd83..76ecbb8ed0e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3114,7 +3114,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, i = 0; } for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; @@ -3487,9 +3487,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3499,9 +3498,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3531,42 +3530,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) @@ -3590,9 +3559,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3625,9 +3593,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3656,9 +3623,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3689,9 +3655,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3704,20 +3669,17 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); } - kunmap_atomic(dst_kaddr, KM_USER0); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -3730,20 +3692,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; if (dst_page != src_page) { - src_kaddr = kmap_atomic(src_page, KM_USER1); + src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; BUG_ON(areas_overlap(src_off, dst_off, len)); } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, -- cgit From bd681513fa6f2ff29aa391f01e413a2d1c59fd77 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 16 Jul 2011 15:23:14 -0400 Subject: Btrfs: switch the btrfs tree locks to reader/writer The btrfs metadata btree is the source of significant lock contention, especially in the root node. This commit changes our locking to use a reader/writer lock. The lock is built on top of rw spinlocks, and it extends the lock tracking to remember if we have a read lock or a write lock when we go to blocking. Atomics count the number of blocking readers or writers at any given time. It removes all of the adaptive spinning from the old code and uses only the spinning/blocking hints inside of btrfs to decide when it should continue spinning. In read heavy workloads this is dramatically faster. In write heavy workloads we're still faster because of less contention on the root node lock. We suffer slightly in dbench because we schedule more often during write locks, but all other benchmarks so far are improved. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 76ecbb8ed0e0..5392c3b12fc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3017,8 +3017,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); -- cgit From 19b6caf4acbf065dc96b47741d99f1b87243c468 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 25 Jul 2011 06:50:50 -0400 Subject: Btrfs: reduce extent_state lock contention for metadata For metadata buffers that don't straddle pages (all of them), btrfs can safely use the page uptodate bits and extent_buffer uptodate bit instead of needing to use the extent_state tree. This greatly reduces contention on the state tree lock. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 55 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5392c3b12fc1..7b60f621e761 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3268,6 +3268,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, return was_dirty; } +static int __eb_straddles_pages(u64 start, u64 len) +{ + if (len < PAGE_CACHE_SIZE) + return 1; + if (start & (PAGE_CACHE_SIZE - 1)) + return 1; + if ((start + len) & (PAGE_CACHE_SIZE - 1)) + return 1; + return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + int clear_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state **cached_state) @@ -3279,8 +3295,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + if (eb_straddles_pages(eb)) { + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3298,8 +3316,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3322,9 +3342,12 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; + if (__eb_straddles_pages(start, end - start + 1)) { + ret = test_range_bit(tree, start, end, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + } while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); @@ -3352,10 +3375,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { @@ -3388,9 +3413,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } } if (start) { -- cgit