summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/block-group.c39
-rw-r--r--fs/btrfs/compression.c74
-rw-r--r--fs/btrfs/compression.h26
-rw-r--r--fs/btrfs/ctree.c5
-rw-r--r--fs/btrfs/ctree.h120
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-ref.c26
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/discard.c2
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/extent-tree.c20
-rw-r--r--fs/btrfs/extent_io.c973
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c110
-rw-r--r--fs/btrfs/file.c48
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c536
-rw-r--r--fs/btrfs/ioctl.c184
-rw-r--r--fs/btrfs/locking.c4
-rw-r--r--fs/btrfs/ordered-data.c253
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/props.c16
-rw-r--r--fs/btrfs/qgroup.c10
-rw-r--r--fs/btrfs/reflink.c52
-rw-r--r--fs/btrfs/relocation.c75
-rw-r--r--fs/btrfs/scrub.c159
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/space-info.c233
-rw-r--r--fs/btrfs/space-info.h30
-rw-r--r--fs/btrfs/subpage.c155
-rw-r--r--fs/btrfs/subpage.h33
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/sysfs.c74
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c61
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c59
-rw-r--r--fs/btrfs/volumes.c24
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zoned.c66
-rw-r--r--fs/btrfs/zoned.h9
44 files changed, 2233 insertions, 1490 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 68b95ad82126..520a0f6a7d9e 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -18,6 +18,8 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
+ depends on !PPC_256K_PAGES # powerpc
+ depends on !PAGE_SIZE_256KB # hexagon
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 117d423fdb93..7a8a2fc19533 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2675,7 +2675,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
*
* @ref_key: The same as @ref_key in handle_direct_tree_backref()
* @tree_key: The first key of this tree block.
- * @path: A clean (released) path, to avoid allocating path everytime
+ * @path: A clean (released) path, to avoid allocating path every time
* the function get called.
*/
static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index aa57bdc8fc89..38b127b9edfc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1491,7 +1490,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
- int ret;
+ LIST_HEAD(again_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1502,6 +1501,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
mutex_lock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
+ int ret = 0;
+
bg = list_first_entry(&fs_info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
@@ -1547,9 +1548,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
bg->start);
next:
- btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
+ if (ret == -EAGAIN && list_empty(&bg->bg_list))
+ list_add_tail(&bg->bg_list, &again_list);
+ else
+ btrfs_put_block_group(bg);
}
+ list_splice_tail(&again_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
@@ -2442,16 +2447,16 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
- num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super -
- cache->zone_unusable - cache->used;
- sinfo->bytes_readonly -= num_bytes;
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
cache->zone_unusable = cache->alloc_offset - cache->used;
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
+ sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -2505,7 +2510,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- u64 num_pages = 0;
+ u64 cache_size = 0;
int retries = 0;
int ret = 0;
@@ -2617,20 +2622,20 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->length, SZ_256M);
- if (!num_pages)
- num_pages = 1;
+ cache_size = div_u64(block_group->length, SZ_256M);
+ if (!cache_size)
+ cache_size = 1;
- num_pages *= 16;
- num_pages *= PAGE_SIZE;
+ cache_size *= 16;
+ cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- num_pages);
+ cache_size);
if (ret)
goto out_put;
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
- num_pages, num_pages,
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+ cache_size, cache_size,
&alloc_hint);
/*
* Our cache requires contiguous chunks so that we don't modify a bunch
@@ -3062,8 +3067,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info,
- num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d17ac301032e..9a023ae0f98b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -149,7 +149,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
const u32 csum_size = fs_info->csum_size;
const u32 sectorsize = fs_info->sectorsize;
struct page *page;
- unsigned long i;
+ unsigned int i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
struct compressed_bio *cb = bio->bi_private;
@@ -208,7 +208,7 @@ static void end_compressed_bio_read(struct bio *bio)
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
- unsigned long index;
+ unsigned int index;
unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
int ret = 0;
@@ -334,7 +334,7 @@ static void end_compressed_bio_write(struct bio *bio)
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
- unsigned long index;
+ unsigned int index;
if (bio->bi_status)
cb->errors = 1;
@@ -349,12 +349,10 @@ static void end_compressed_bio_write(struct bio *bio)
* call back into the FS and do all the end_io operations
*/
inode = cb->inode;
- cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
btrfs_record_physical_zoned(inode, cb->start, bio);
- btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
bio->bi_status == BLK_STS_OK);
- cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
@@ -387,10 +385,10 @@ out:
* the end io hooks.
*/
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
+ unsigned int nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
@@ -427,24 +425,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bio->bi_end_io = end_compressed_bio_write;
if (use_append) {
- struct extent_map *em;
- struct map_lookup *map;
- struct block_device *bdev;
+ struct btrfs_device *device;
- em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(em)) {
+ device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
+ if (IS_ERR(device)) {
kfree(cb);
bio_put(bio);
return BLK_STS_NOTSUPP;
}
- map = em->map_lookup;
- /* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
- bdev = map->stripes[0].dev->bdev;
-
- bio_set_dev(bio, bdev);
- free_extent_map(em);
+ bio_set_dev(bio, device->bdev);
}
if (blkcg_css) {
@@ -457,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
- int len;
+ int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
@@ -465,10 +455,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page, PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ /*
+ * Page can only be added to bio if the current bio fits in
+ * stripe.
+ */
+ if (!submit) {
+ if (pg_index == 0 && use_append)
+ len = bio_add_zone_append_page(bio, page,
+ PAGE_SIZE, 0);
+ else
+ len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ }
page->mapping = NULL;
if (submit || len < PAGE_SIZE) {
@@ -508,7 +505,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
}
if (bytes_left < PAGE_SIZE) {
btrfs_info(fs_info,
- "bytes left %lu compress len %lu nr %lu",
+ "bytes left %lu compress len %u nr %u",
bytes_left, cb->compressed_len, cb->nr_pages);
}
bytes_left -= PAGE_SIZE;
@@ -670,9 +667,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
- unsigned long compressed_len;
- unsigned long nr_pages;
- unsigned long pg_index;
+ unsigned int compressed_len;
+ unsigned int nr_pages;
+ unsigned int pg_index;
struct page *page;
struct bio *comp_bio;
u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
@@ -1195,9 +1192,6 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
- *
- * @max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
*/
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
@@ -1218,20 +1212,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
return ret;
}
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * orig_bio contains the pages from the file that we want to decompress into
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous. They all correspond to the range of bytes covered by
- * the compressed extent.
- */
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 8001b700ea3a..c359f20920d0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -31,6 +31,9 @@ struct compressed_bio {
/* number of bios pending for this compressed extent */
refcount_t pending_bios;
+ /* Number of compressed pages in the array */
+ unsigned int nr_pages;
+
/* the pages with the compressed data on them */
struct page **compressed_pages;
@@ -40,20 +43,17 @@ struct compressed_bio {
/* starting offset in the inode for our pages */
u64 start;
- /* number of bytes in the inode we're working on */
- unsigned long len;
-
- /* number of bytes on disk */
- unsigned long compressed_len;
+ /* Number of bytes in the inode we're working on */
+ unsigned int len;
- /* the compression algorithm for this bio */
- int compress_type;
+ /* Number of bytes on disk */
+ unsigned int compressed_len;
- /* number of compressed pages in the array */
- unsigned long nr_pages;
+ /* The compression algorithm for this bio */
+ u8 compress_type;
/* IO errors */
- int errors;
+ u8 errors;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -91,10 +91,10 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
struct bio *bio);
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
+ unsigned int nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a484fb72a01f..4bc3ca2cbd7d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -596,7 +596,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
- trans->dirty = true;
*cow_ret = buf;
return 0;
}
@@ -1788,10 +1787,8 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b)) {
- trans->dirty = true;
+ if (!should_cow_block(trans, root, b))
goto cow_done;
- }
/*
* must have write locks on this node and the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9fb76829a281..e5e53e592d4f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -561,10 +561,16 @@ enum {
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
- * Set and cleared while holding fs_info::balance_mutex.
*/
BTRFS_FS_BALANCE_RUNNING,
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ * Set, tested and cleared while holding fs_info::send_reloc_lock.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
@@ -817,8 +823,6 @@ struct btrfs_fs_info {
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
- u64 total_pinned;
-
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
@@ -871,6 +875,9 @@ struct btrfs_fs_info {
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
u32 data_chunk_allocations;
u32 metadata_ratio;
@@ -986,14 +993,15 @@ struct btrfs_fs_info {
struct crypto_shash *csum_shash;
+ spinlock_t send_reloc_lock;
/*
* Number of send operations in progress.
- * Updated while holding fs_info::balance_mutex.
+ * Updated while holding fs_info::send_reloc_lock.
*/
int send_in_progress;
- /* Type of exclusive operation running */
- unsigned long exclusive_operation;
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
/*
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
@@ -1375,38 +1383,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
*
* Note: don't forget to add new options to btrfs_show_options()
*/
-#define BTRFS_MOUNT_NODATASUM (1 << 0)
-#define BTRFS_MOUNT_NODATACOW (1 << 1)
-#define BTRFS_MOUNT_NOBARRIER (1 << 2)
-#define BTRFS_MOUNT_SSD (1 << 3)
-#define BTRFS_MOUNT_DEGRADED (1 << 4)
-#define BTRFS_MOUNT_COMPRESS (1 << 5)
-#define BTRFS_MOUNT_NOTREELOG (1 << 6)
-#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
-#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
-#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
-#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
-#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
-#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
-#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
-#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
-/* bit 17 is free */
-#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
-#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
-#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
-#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
-#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
-#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
-#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
-#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
-#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
-#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
-#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
-#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
-#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30)
-#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31)
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+};
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2216,11 +2225,13 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
@@ -2746,9 +2757,9 @@ enum btrfs_reserve_flush_enum {
/*
* Flush space by above mentioned methods and by:
* - Running delayed iputs
- * - Commiting transaction
+ * - Committing transaction
*
- * Can be interruped by fatal signal.
+ * Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_DATA,
BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
@@ -2758,7 +2769,7 @@ enum btrfs_reserve_flush_enum {
* Pretty much the same as FLUSH_ALL, but can also steal space from
* global rsv.
*
- * Can be interruped by fatal signal.
+ * Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
@@ -2774,7 +2785,6 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 8,
RUN_DELAYED_IPUTS = 9,
COMMIT_TRANS = 10,
- FORCE_COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -3100,8 +3110,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3125,7 +3135,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode, u64 new_size,
- u32 min_type);
+ u32 min_type, u64 *extents_found);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -3146,9 +3156,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags);
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size);
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
@@ -3187,7 +3195,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
@@ -3222,6 +3231,9 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
@@ -3786,4 +3798,14 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+/*
+ * We use page status Private2 to indicate there is an ordered extent with
+ * unfinished IO.
+ *
+ * Rename the Private2 accessors to Ordered, to improve readability.
+ */
+#define PageOrdered(page) PagePrivate2(page)
+#define SetPageOrdered(page) SetPagePrivate2(page)
+#define ClearPageOrdered(page) ClearPagePrivate2(page)
+
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 56642ca7af10..2059d1504149 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -89,7 +89,7 @@
* ->outstanding_extents += 1 (current value is 1)
*
* -> set_delalloc
- * ->outstanding_extents += 1 (currrent value is 2)
+ * ->outstanding_extents += 1 (current value is 2)
*
* -> btrfs_delalloc_release_extents()
* ->outstanding_extents -= 1 (current value is 1)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1a88f6214ebc..257c1e18abd4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -681,7 +681,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
{
struct btrfs_delayed_item *curr, *next;
int free_space;
- int total_data_size = 0, total_size = 0;
+ int total_size = 0;
struct extent_buffer *leaf;
char *data_ptr;
struct btrfs_key *keys;
@@ -706,7 +706,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
*/
while (total_size + next->data_len + sizeof(struct btrfs_item) <=
free_space) {
- total_data_size += next->data_len;
total_size += next->data_len + sizeof(struct btrfs_item);
list_add_tail(&next->tree_list, &head);
nitems++;
@@ -974,14 +973,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
- struct btrfs_delayed_root *delayed_root;
- ASSERT(delayed_node->root);
- clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
- delayed_node->count--;
+ if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ struct btrfs_delayed_root *delayed_root;
- delayed_root = delayed_node->root->fs_info->delayed_root;
- finish_one_item(delayed_root);
+ ASSERT(delayed_node->root);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
}
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1009,12 +1010,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
- if (ret > 0) {
- btrfs_release_path(path);
- return -ENOENT;
- } else if (ret < 0) {
- return ret;
- }
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1024,7 +1023,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
- goto no_iref;
+ goto out;
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf))
@@ -1046,12 +1045,19 @@ again:
btrfs_del_item(trans, root, path);
out:
btrfs_release_delayed_iref(node);
-no_iref:
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
return ret;
search:
@@ -1898,8 +1904,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_item(prev_item);
}
- if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
- btrfs_release_delayed_iref(delayed_node);
+ btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index c92d9d4f5f46..06bc842ecdb3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
}
- /*
- * This handles the following conditions:
- *
- * 1. We had a ref mod of 0 or more and went negative, indicating that
- * we may be freeing space, so add our space to the
- * total_bytes_pinned counter.
- * 2. We were negative and went to 0 or positive, so no longer can say
- * that the space would be pinned, decrement our counter from the
- * total_bytes_pinned counter.
- * 3. We are now at 0 and have ->must_insert_reserved set, which means
- * this was a new allocation and then we dropped it, and thus must
- * add our space to the total_bytes_pinned counter.
- */
- if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
- else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
- else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
-
spin_unlock(&existing->lock);
}
@@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- u64 flags = btrfs_ref_head_to_space_flags(head_ref);
-
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
- if (head_ref->ref_mod < 0)
- btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
- head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d05f73530af7..d029be40ea6f 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -37,7 +37,7 @@
* - Write duplication
*
* All new writes will be written to both target and source devices, so even
- * if replace gets canceled, sources device still contans up-to-date data.
+ * if replace gets canceled, sources device still contains up-to-date data.
*
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
* Start: btrfs_dev_replace_start()
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 306ff20af70f..e1b7bd927d69 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -624,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
- * order of operations is changed. In the normal sychronous discard path, the
+ * order of operations is changed. In the normal synchronous discard path, the
* block groups are trimmed via a single large trim in transaction commit. This
* is ultimately what we are trying to avoid with asynchronous discard. Thus,
* it must be done before going down the unused_bgs path.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c9a3036c23bf..b117dd3b8172 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -241,7 +241,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -249,9 +248,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock)
- btrfs_tree_read_lock(eb);
-
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (extent_buffer_uptodate(eb) &&
@@ -264,22 +260,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
-
- /*
- * Things reading via commit roots that don't have normal protection,
- * like send, can have a really old block in cache that may point at a
- * block that has been freed and re-allocated. So don't clear uptodate
- * if we find an eb that is under IO (dirty/writeback) because we could
- * end up reading in the stale data and then writing it back out and
- * making everybody very sad.
- */
- if (!extent_buffer_under_io(eb))
- clear_extent_buffer_uptodate(eb);
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
- if (need_lock)
- btrfs_tree_read_unlock(eb);
return ret;
}
@@ -584,6 +568,7 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u32 csum_size = fs_info->csum_size;
u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
+ const u8 *header_csum;
int ret = 0;
found_start = btrfs_header_bytenr(eb);
@@ -608,15 +593,14 @@ static int validate_extent_buffer(struct extent_buffer *eb)
}
csum_tree_block(eb, result);
+ header_csum = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
- if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u8 val[BTRFS_CSUM_SIZE] = { 0 };
-
- read_extent_buffer(eb, &val, 0, csum_size);
+ if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- fs_info->sb->s_id, eb->start,
- CSUM_FMT_VALUE(csum_size, val),
+ "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start,
+ CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
@@ -917,23 +901,22 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
return btree_csum_one_bio(bio);
}
-static int check_async_write(struct btrfs_fs_info *fs_info,
+static bool should_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
if (btrfs_is_zoned(fs_info))
- return 0;
+ return false;
if (atomic_read(&bi->sync_writers))
- return 0;
+ return false;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
- return 0;
- return 1;
+ return false;
+ return true;
}
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
@@ -946,7 +929,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!async) {
+ } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
@@ -2252,6 +2235,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
}
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
@@ -2648,6 +2632,24 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
+ fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
+ memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
+ fs_info->super_copy->metadata_uuid,
+ fs_info->fs_devices->metadata_uuid);
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2981,6 +2983,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
+ spin_lock_init(&fs_info->send_reloc_lock);
fs_info->send_in_progress = 0;
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
@@ -3279,14 +3282,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
- ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE));
-
- if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
- ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid,
- BTRFS_FSID_SIZE));
- }
features = btrfs_super_flags(disk_super);
if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
@@ -3461,7 +3456,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* At this point we know all the devices that make this filesystem,
* including the seed devices but we don't know yet if the replace
* target is required. So free devices that are not part of this
- * filesystem but skip the replace traget device which is checked
+ * filesystem but skip the replace target device which is checked
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
@@ -3588,8 +3583,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(fs_info, fs_devices,
btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
- 1 : 0,
+ CHECK_INTEGRITY_DATA) ? 1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
btrfs_warn(fs_info,
@@ -4686,9 +4680,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
cache->space_info->bytes_reserved -= head->num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
btrfs_put_block_group(cache);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f1d15b68994a..d296483d148f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1425,7 +1425,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* bytenr of the parent block. Since new extents are always
* created with indirect references, this will only be the case
* when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
+ * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
* be 0
*
* @root_objectid: The id of the root where this modification has originated,
@@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
- /*
- * We were dropping refs, or had a new ref and dropped it, and thus must
- * adjust down our total_bytes_pinned, the space may or may not have
- * been pinned and so is accounted for properly in the pinned space by
- * now.
- */
- if (head->total_ref_mod < 0 ||
- (head->total_ref_mod == 0 && head->must_insert_reserved)) {
- u64 flags = btrfs_ref_head_to_space_flags(head);
-
- btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
- }
-
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -1868,7 +1855,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
@@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -4784,7 +4769,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
- trans->dirty = true;
/* this returns a buffer locked for blocking */
return buf;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dee2dafbc872..9e81d25dea70 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -136,7 +136,7 @@ struct tree_entry {
};
struct extent_page_data {
- struct bio *bio;
+ struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -185,10 +185,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
- if (epd->bio) {
- epd->bio->bi_status = errno_to_blk_status(ret);
- bio_endio(epd->bio);
- epd->bio = NULL;
+ struct bio *bio = epd->bio_ctrl.bio;
+
+ if (bio) {
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ epd->bio_ctrl.bio = NULL;
}
}
@@ -201,9 +203,10 @@ static void end_write_bio(struct extent_page_data *epd, int ret)
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
int ret = 0;
+ struct bio *bio = epd->bio_ctrl.bio;
- if (epd->bio) {
- ret = submit_one_bio(epd->bio, 0, 0);
+ if (bio) {
+ ret = submit_one_bio(bio, 0, 0);
/*
* Clean up of epd->bio is handled by its endio function.
* And endio is either triggered by successful bio execution
@@ -211,7 +214,7 @@ static int __must_check flush_write_bio(struct extent_page_data *epd)
* So at this point, no matter what happened, we don't need
* to clean up epd->bio.
*/
- epd->bio = NULL;
+ epd->bio_ctrl.bio = NULL;
}
return ret;
}
@@ -1805,10 +1808,130 @@ out:
return found;
}
+/*
+ * Process one page for __process_pages_contig().
+ *
+ * Return >0 if we hit @page == @locked_page.
+ * Return 0 if we updated the page status.
+ * Return -EGAIN if the we need to try again.
+ * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
+ */
+static int process_one_page(struct btrfs_fs_info *fs_info,
+ struct address_space *mapping,
+ struct page *page, struct page *locked_page,
+ unsigned long page_ops, u64 start, u64 end)
+{
+ u32 len;
+
+ ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
+ len = end + 1 - start;
+
+ if (page_ops & PAGE_SET_ORDERED)
+ btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ if (page_ops & PAGE_SET_ERROR)
+ btrfs_page_clamp_set_error(fs_info, page, start, len);
+ if (page_ops & PAGE_START_WRITEBACK) {
+ btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
+ btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ }
+ if (page_ops & PAGE_END_WRITEBACK)
+ btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+
+ if (page == locked_page)
+ return 1;
+
+ if (page_ops & PAGE_LOCK) {
+ int ret;
+
+ ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
+ if (ret)
+ return ret;
+ if (!PageDirty(page) || page->mapping != mapping) {
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return -EAGAIN;
+ }
+ }
+ if (page_ops & PAGE_UNLOCK)
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return 0;
+}
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret);
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned long nr_pages = end_index - start_index + 1;
+ unsigned long pages_processed = 0;
+ struct page *pages[16];
+ int err = 0;
+ int i;
+
+ if (page_ops & PAGE_LOCK) {
+ ASSERT(page_ops == PAGE_LOCK);
+ ASSERT(processed_end && *processed_end == start);
+ }
+
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(mapping, -EIO);
+
+ while (nr_pages > 0) {
+ int found_pages;
+
+ found_pages = find_get_pages_contig(mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ if (found_pages == 0) {
+ /*
+ * Only if we're going to lock these pages, we can find
+ * nothing at @index.
+ */
+ ASSERT(page_ops & PAGE_LOCK);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ for (i = 0; i < found_pages; i++) {
+ int process_ret;
+
+ process_ret = process_one_page(fs_info, mapping,
+ pages[i], locked_page, page_ops,
+ start, end);
+ if (process_ret < 0) {
+ for (; i < found_pages; i++)
+ put_page(pages[i]);
+ err = -EAGAIN;
+ goto out;
+ }
+ put_page(pages[i]);
+ pages_processed++;
+ }
+ nr_pages -= found_pages;
+ index += found_pages;
+ cond_resched();
+ }
+out:
+ if (err && processed_end) {
+ /*
+ * Update @processed_end. I know this is awful since it has
+ * two different return value patterns (inclusive vs exclusive).
+ *
+ * But the exclusive pattern is necessary if @start is 0, or we
+ * underflow and check against processed_end won't work as
+ * expected.
+ */
+ if (pages_processed)
+ *processed_end = min(end,
+ ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
+ else
+ *processed_end = start;
+ }
+ return err;
+}
static noinline void __unlock_for_delalloc(struct inode *inode,
struct page *locked_page,
@@ -1821,7 +1944,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
if (index == locked_page->index && end_index == index)
return;
- __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
+ __process_pages_contig(inode->i_mapping, locked_page, start, end,
PAGE_UNLOCK, NULL);
}
@@ -1831,19 +1954,19 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_end)
{
unsigned long index = delalloc_start >> PAGE_SHIFT;
- unsigned long index_ret = index;
unsigned long end_index = delalloc_end >> PAGE_SHIFT;
+ u64 processed_end = delalloc_start;
int ret;
ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
- ret = __process_pages_contig(inode->i_mapping, locked_page, index,
- end_index, PAGE_LOCK, &index_ret);
- if (ret == -EAGAIN)
+ ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
+ delalloc_end, PAGE_LOCK, &processed_end);
+ if (ret == -EAGAIN && processed_end > delalloc_start)
__unlock_for_delalloc(inode, locked_page, delalloc_start,
- (u64)index_ret << PAGE_SHIFT);
+ processed_end);
return ret;
}
@@ -1936,84 +2059,6 @@ out_failed:
return found;
}
-static int __process_pages_contig(struct address_space *mapping,
- struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret)
-{
- unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_processed = 0;
- pgoff_t index = start_index;
- struct page *pages[16];
- unsigned ret;
- int err = 0;
- int i;
-
- if (page_ops & PAGE_LOCK) {
- ASSERT(page_ops == PAGE_LOCK);
- ASSERT(index_ret && *index_ret == start_index);
- }
-
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
- mapping_set_error(mapping, -EIO);
-
- while (nr_pages > 0) {
- ret = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- /*
- * Only if we're going to lock these pages,
- * can we find nothing at @index.
- */
- ASSERT(page_ops & PAGE_LOCK);
- err = -EAGAIN;
- goto out;
- }
-
- for (i = 0; i < ret; i++) {
- if (page_ops & PAGE_SET_PRIVATE2)
- SetPagePrivate2(pages[i]);
-
- if (locked_page && pages[i] == locked_page) {
- put_page(pages[i]);
- pages_processed++;
- continue;
- }
- if (page_ops & PAGE_START_WRITEBACK) {
- clear_page_dirty_for_io(pages[i]);
- set_page_writeback(pages[i]);
- }
- if (page_ops & PAGE_SET_ERROR)
- SetPageError(pages[i]);
- if (page_ops & PAGE_END_WRITEBACK)
- end_page_writeback(pages[i]);
- if (page_ops & PAGE_UNLOCK)
- unlock_page(pages[i]);
- if (page_ops & PAGE_LOCK) {
- lock_page(pages[i]);
- if (!PageDirty(pages[i]) ||
- pages[i]->mapping != mapping) {
- unlock_page(pages[i]);
- for (; i < ret; i++)
- put_page(pages[i]);
- err = -EAGAIN;
- goto out;
- }
- }
- put_page(pages[i]);
- pages_processed++;
- }
- nr_pages -= ret;
- index += ret;
- cond_resched();
- }
-out:
- if (err && index_ret)
- *index_ret = start_index + pages_processed - 1;
- return err;
-}
-
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
@@ -2021,8 +2066,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT,
- page_ops, NULL);
+ start, end, page_ops, NULL);
}
/*
@@ -2381,13 +2425,6 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
BUG_ON(!failrec->this_mirror);
- if (failrec->in_validation) {
- /* there was no real error, just free the record */
- btrfs_debug(fs_info,
- "clean_io_failure: freeing dummy error at %llu",
- failrec->start);
- goto out;
- }
if (sb_rdonly(fs_info->sb))
goto out;
@@ -2449,7 +2486,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
- u64 start, u64 end)
+ u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2457,15 +2494,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
u64 logical;
failrec = get_state_failrec(failure_tree, start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
- failrec->logical, failrec->start, failrec->len,
- failrec->in_validation);
+ "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
+ failrec->logical, failrec->start, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
@@ -2480,10 +2517,9 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return ERR_PTR(-ENOMEM);
failrec->start = start;
- failrec->len = end - start + 1;
+ failrec->len = sectorsize;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
- failrec->in_validation = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -2519,12 +2555,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
free_extent_map(em);
/* Set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
+ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0) {
ret = set_state_failrec(failure_tree, start, failrec);
/* Set the bits in the inode's tree */
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ ret = set_extent_bits(tree, start, start + sectorsize - 1,
+ EXTENT_DAMAGED);
} else if (ret < 0) {
kfree(failrec);
return ERR_PTR(ret);
@@ -2533,7 +2570,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+static bool btrfs_check_repairable(struct inode *inode,
struct io_failure_record *failrec,
int failed_mirror)
{
@@ -2553,39 +2590,22 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
return false;
}
+ /* The failure record should only contain one sector */
+ ASSERT(failrec->len == fs_info->sectorsize);
+
/*
- * there are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
*/
- if (needs_validation) {
- /*
- * to fulfill b), we need to know the exact failing sectors, as
- * we don't want to rewrite any more than the failed ones. thus,
- * we need separate read requests for the failed bio
- *
- * if the following BUG_ON triggers, our validation request got
- * merged. we need separate requests for our algorithm to work.
- */
- BUG_ON(failrec->in_validation);
- failrec->in_validation = 1;
- failrec->this_mirror = failed_mirror;
- } else {
- /*
- * we're ready to fulfill a) and b) alongside. get a good copy
- * of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
- */
- if (failrec->in_validation) {
- BUG_ON(failrec->this_mirror != failed_mirror);
- failrec->in_validation = 0;
- failrec->this_mirror = 0;
- }
- failrec->failed_mirror = failed_mirror;
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
- }
if (failrec->this_mirror > num_copies) {
btrfs_debug(fs_info,
@@ -2597,53 +2617,11 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
return true;
}
-static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
-{
- u64 len = 0;
- const u32 blocksize = inode->i_sb->s_blocksize;
-
- /*
- * If bi_status is BLK_STS_OK, then this was a checksum error, not an
- * I/O error. In this case, we already know exactly which sector was
- * bad, so we don't need to validate.
- */
- if (bio->bi_status == BLK_STS_OK)
- return false;
-
- /*
- * We need to validate each sector individually if the failed I/O was
- * for multiple sectors.
- *
- * There are a few possible bios that can end up here:
- * 1. A buffered read bio, which is not cloned.
- * 2. A direct I/O read bio, which is cloned.
- * 3. A (buffered or direct) repair bio, which is not cloned.
- *
- * For cloned bios (case 2), we can get the size from
- * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
- * it from the bvecs.
- */
- if (bio_flagged(bio, BIO_CLONED)) {
- if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
- return true;
- } else {
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_bvec_all(bvec, bio, i) {
- len += bvec->bv_len;
- if (len > blocksize)
- return true;
- }
- }
- return false;
-}
-
-blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- submit_bio_hook_t *submit_bio_hook)
+int btrfs_repair_one_sector(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook)
{
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2651,7 +2629,6 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
- bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
@@ -2661,23 +2638,19 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, start, end);
+ failrec = btrfs_get_io_failure_record(inode, start);
if (IS_ERR(failrec))
- return errno_to_blk_status(PTR_ERR(failrec));
+ return PTR_ERR(failrec);
- need_validation = btrfs_io_needs_validation(inode, failed_bio);
- if (!btrfs_check_repairable(inode, need_validation, failrec,
- failed_mirror)) {
+ if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
- return BLK_STS_IOERR;
+ return -EIO;
}
repair_bio = btrfs_io_bio_alloc(1);
repair_io_bio = btrfs_io_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
- if (need_validation)
- repair_bio->bi_opf |= REQ_FAILFAST_DEV;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
@@ -2695,8 +2668,8 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_io_bio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
-"repair read error: submitting new read to mirror %d, in_validation=%d",
- failrec->this_mirror, failrec->in_validation);
+ "repair read error: submitting new read to mirror %d",
+ failrec->this_mirror);
status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
failrec->bio_flags);
@@ -2704,17 +2677,114 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
free_io_failure(failure_tree, tree, failrec);
bio_put(repair_bio);
}
- return status;
+ return blk_status_to_errno(status);
+}
+
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
+ if (uptodate) {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ unlock_page(page);
+ else
+ btrfs_subpage_end_reader(fs_info, page, start, len);
+}
+
+static blk_status_t submit_read_repair(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ unsigned int error_bitmap,
+ submit_bio_hook_t *submit_bio_hook)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
+ int error = 0;
+ int i;
+
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+
+ /* We're here because we had some read errors or csum mismatch */
+ ASSERT(error_bitmap);
+
+ /*
+ * We only get called on buffered IO, thus page must be mapped and bio
+ * must not be cloned.
+ */
+ ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
+
+ /* Iterate through all the sectors in the range */
+ for (i = 0; i < nr_bits; i++) {
+ const unsigned int offset = i * sectorsize;
+ struct extent_state *cached = NULL;
+ bool uptodate = false;
+ int ret;
+
+ if (!(error_bitmap & (1U << i))) {
+ /*
+ * This sector has no error, just end the page read
+ * and unlock the range.
+ */
+ uptodate = true;
+ goto next;
+ }
+
+ ret = btrfs_repair_one_sector(inode, failed_bio,
+ bio_offset + offset,
+ page, pgoff + offset, start + offset,
+ failed_mirror, submit_bio_hook);
+ if (!ret) {
+ /*
+ * We have submitted the read repair, the page release
+ * will be handled by the endio function of the
+ * submitted repair bio.
+ * Thus we don't need to do any thing here.
+ */
+ continue;
+ }
+ /*
+ * Repair failed, just record the error but still continue.
+ * Or the remaining sectors will not be properly unlocked.
+ */
+ if (!error)
+ error = ret;
+next:
+ end_page_read(page, uptodate, start + offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&BTRFS_I(inode)->io_tree,
+ start + offset,
+ start + offset + sectorsize - 1,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
+ start + offset,
+ start + offset + sectorsize - 1,
+ &cached);
+ }
+ return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
+ struct btrfs_inode *inode;
int uptodate = (err == 0);
int ret = 0;
- btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
+ ASSERT(page && page->mapping);
+ inode = BTRFS_I(page->mapping->host);
+ btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
ClearPageUptodate(page);
@@ -2747,25 +2817,20 @@ static void end_bio_extent_writepage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
+ /* Our read/write should always be sector aligned. */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page write with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
if (first_bvec) {
btrfs_record_physical_zoned(inode, start, bio);
@@ -2773,7 +2838,8 @@ static void end_bio_extent_writepage(struct bio *bio)
}
end_extent_writepage(page, error, start, end);
- end_page_writeback(page);
+
+ btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
}
bio_put(bio);
@@ -2862,30 +2928,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}
-static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
-
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
-
- if (uptodate) {
- btrfs_page_set_uptodate(fs_info, page, start, len);
- } else {
- btrfs_page_clear_uptodate(fs_info, page, start, len);
- btrfs_page_set_error(fs_info, page, start, len);
- }
-
- if (fs_info->sectorsize == PAGE_SIZE)
- unlock_page(page);
- else if (is_data_inode(page->mapping->host))
- /*
- * For subpage data, unlock the page if we're the last reader.
- * For subpage metadata, page lock is not utilized for read.
- */
- btrfs_subpage_end_reader(fs_info, page, start, len);
-}
-
/*
* Find extent buffer for a givne bytenr.
*
@@ -2929,7 +2971,6 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
@@ -2944,10 +2985,12 @@ static void end_bio_extent_readpage(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
+ bool uptodate = !bio->bi_status;
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ unsigned int error_bitmap = (unsigned int)-1;
u64 start;
u64 end;
u32 len;
@@ -2982,14 +3025,16 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio,
+ if (is_data_inode(inode)) {
+ error_bitmap = btrfs_verify_data_csum(io_bio,
bio_offset, page, start, end);
- else
+ ret = error_bitmap;
+ } else {
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
+ }
if (ret)
- uptodate = 0;
+ uptodate = false;
else
clean_io_failure(BTRFS_I(inode)->root->fs_info,
failure_tree, tree, start,
@@ -3001,27 +3046,18 @@ static void end_bio_extent_readpage(struct bio *bio)
goto readpage_ok;
if (is_data_inode(inode)) {
-
/*
- * The generic bio_readpage_error handles errors the
- * following way: If possible, new read requests are
- * created and submitted and will end up in
- * end_bio_extent_readpage as well (if we're lucky,
- * not in the !uptodate case). In that case it returns
- * 0 and we just go on with the next page in our bio.
- * If it can't handle the error it will return -EIO and
- * we remain responsible for that page.
+ * btrfs_submit_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
*/
- if (!btrfs_submit_read_repair(inode, bio, bio_offset,
- page,
- start - page_offset(page),
- start, end, mirror,
- btrfs_submit_data_bio)) {
- uptodate = !bio->bi_status;
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
- continue;
- }
+ submit_read_repair(inode, bio, bio_offset, page,
+ start - page_offset(page), start,
+ end, mirror, error_bitmap,
+ btrfs_submit_data_bio);
+
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
+ continue;
} else {
struct extent_buffer *eb;
@@ -3151,42 +3187,99 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
*
* Return true if successfully page added. Otherwise, return false.
*/
-static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
+static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
- unsigned long prev_bio_flags,
unsigned long bio_flags)
{
+ struct bio *bio = bio_ctrl->bio;
+ u32 bio_size = bio->bi_iter.bi_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
- if (prev_bio_flags != bio_flags)
+ ASSERT(bio);
+ /* The limit should be calculated when bio_ctrl->bio is allocated */
+ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
+ if (bio_ctrl->bio_flags != bio_flags)
return false;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
+ if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
if (!contig)
return false;
- if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
+ if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
+ bio_size + size > bio_ctrl->len_to_stripe_boundary)
return false;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *first_page = bio_first_bvec_all(bio)->bv_page;
-
- if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size))
- return false;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = bio_add_zone_append_page(bio, page, size, pg_offset);
- } else {
+ else
ret = bio_add_page(bio, page, size, pg_offset);
- }
return ret == size;
}
+static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_io_geometry geom;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em;
+ u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
+ int ret;
+
+ /*
+ * Pages for compressed extent are never submitted to disk directly,
+ * thus it has no real boundary, just set them to U32_MAX.
+ *
+ * The split happens for real compressed bio, which happens in
+ * btrfs_submit_compressed_read/write().
+ */
+ if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ return 0;
+ }
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
+ logical, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ return ret;
+ }
+ if (geom.len > U32_MAX)
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ else
+ bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
+
+ if (!btrfs_is_zoned(fs_info) ||
+ bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ ASSERT(fs_info->max_zone_append_size > 0);
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, logical);
+ if (!ordered) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->disk_bytenr + ordered->disk_num_bytes - logical);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
@@ -3203,12 +3296,11 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
+ struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
- struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
- unsigned long prev_bio_flags,
unsigned long bio_flags,
bool force_bio_submit)
{
@@ -3219,19 +3311,19 @@ static int submit_extent_page(unsigned int opf,
struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- ASSERT(bio_ret);
+ ASSERT(bio_ctrl);
- if (*bio_ret) {
- bio = *bio_ret;
+ ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
+ pg_offset + size <= PAGE_SIZE);
+ if (bio_ctrl->bio) {
+ bio = bio_ctrl->bio;
if (force_bio_submit ||
- !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
- pg_offset, prev_bio_flags, bio_flags)) {
- ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
- if (ret < 0) {
- *bio_ret = NULL;
+ !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
+ pg_offset, bio_flags)) {
+ ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+ bio_ctrl->bio = NULL;
+ if (ret < 0)
return ret;
- }
- bio = NULL;
} else {
if (wbc)
wbc_account_cgroup_owner(wbc, page, io_size);
@@ -3254,22 +3346,18 @@ static int submit_extent_page(unsigned int opf,
wbc_account_cgroup_owner(wbc, page, io_size);
}
if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_device *device;
- em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
- map = em->map_lookup;
- /* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
- btrfs_io_bio(bio)->device = map->stripes[0].dev;
-
- free_extent_map(em);
+ btrfs_io_bio(bio)->device = device;
}
- *bio_ret = bio;
+ bio_ctrl->bio = bio;
+ bio_ctrl->bio_flags = bio_flags;
+ ret = calc_bio_boundaries(bio_ctrl, inode);
return ret;
}
@@ -3382,7 +3470,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* return 0 on success, otherwise return error
*/
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct bio **bio, unsigned long *bio_flags,
+ struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
@@ -3558,15 +3646,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, disk_bytenr, iosize,
- pg_offset, bio,
+ bio_ctrl, page, disk_bytenr, iosize,
+ pg_offset,
end_bio_extent_readpage, 0,
- *bio_flags,
this_bio_flag,
force_bio_submit);
if (!ret) {
nr++;
- *bio_flags = this_bio_flag;
} else {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
@@ -3580,11 +3666,10 @@ out:
}
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
- u64 start, u64 end,
- struct extent_map **em_cached,
- struct bio **bio,
- unsigned long *bio_flags,
- u64 *prev_em_start)
+ u64 start, u64 end,
+ struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 *prev_em_start)
{
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
@@ -3592,7 +3677,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
@@ -3680,6 +3765,54 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
}
/*
+ * Find the first byte we need to write.
+ *
+ * For subpage, one page can contain several sectors, and
+ * __extent_writepage_io() will just grab all extent maps in the page
+ * range and try to submit all non-inline/non-compressed extents.
+ *
+ * This is a big problem for subpage, we shouldn't re-submit already written
+ * data at all.
+ * This function will lookup subpage dirty bit to find which range we really
+ * need to submit.
+ *
+ * Return the next dirty range in [@start, @end).
+ * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ */
+static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+ struct page *page, u64 *start, u64 *end)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u64 orig_start = *start;
+ /* Declare as unsigned long so we can use bitmap ops */
+ unsigned long dirty_bitmap;
+ unsigned long flags;
+ int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
+ int range_start_bit = nbits;
+ int range_end_bit;
+
+ /*
+ * For regular sector size == page size case, since one page only
+ * contains one sector, we return the page offset directly.
+ */
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ *start = page_offset(page);
+ *end = page_offset(page) + PAGE_SIZE;
+ return;
+ }
+
+ /* We should have the page locked, but just in case */
+ spin_lock_irqsave(&subpage->lock, flags);
+ dirty_bitmap = subpage->dirty_bitmap;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
+ BTRFS_SUBPAGE_BITMAP_SIZE);
+ *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
+ *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+}
+
+/*
* helper for __extent_writepage. This calls the writepage start hooks,
* and does the loop to map the page into extents and bios.
*
@@ -3696,7 +3829,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -3727,15 +3859,26 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
+ u64 dirty_range_start = cur;
+ u64 dirty_range_end;
u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
+ btrfs_writepage_endio_finish_ordered(inode, page, cur,
+ end, 1);
break;
}
+
+ find_next_dirty_byte(fs_info, page, &dirty_range_start,
+ &dirty_range_end);
+ if (cur < dirty_range_start) {
+ cur = dirty_range_start;
+ continue;
+ }
+
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
}
@@ -3750,8 +3893,11 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
disk_bytenr = em->block_start + extent_offset;
- /* Note that em_end from extent_map_end() is exclusive */
- iosize = min(em_end, end + 1) - cur;
+ /*
+ * Note that em_end from extent_map_end() and dirty_range_end from
+ * find_next_dirty_byte() are all exclusive
+ */
+ iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
if (btrfs_use_zone_append(inode, em->block_start))
opf = REQ_OP_ZONE_APPEND;
@@ -3768,28 +3914,38 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (compressed)
nr++;
else
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1, 1);
+ btrfs_writepage_endio_finish_ordered(inode,
+ page, cur, cur + iosize - 1, 1);
cur += iosize;
continue;
}
- btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
+ btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
- ret = submit_extent_page(opf | write_flags, wbc, page,
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, subpage dirty bit is not cleared.
+ * So clear subpage dirty bit here so next time we won't submit
+ * page for range already written to disk.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+
+ ret = submit_extent_page(opf | write_flags, wbc,
+ &epd->bio_ctrl, page,
disk_bytenr, iosize,
- cur - page_offset(page), &epd->bio,
+ cur - page_offset(page),
end_bio_extent_writepage,
- 0, 0, 0, false);
+ 0, 0, false);
if (ret) {
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
- end_page_writeback(page);
+ btrfs_page_clear_writeback(fs_info, page, cur,
+ iosize);
}
cur += iosize;
@@ -4098,12 +4254,15 @@ static struct extent_buffer *find_extent_buffer_nolock(
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page has finished their writeback.
*/
-static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct bio *bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4154,16 +4313,11 @@ static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
- struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
- fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
- return end_bio_subpage_eb_writepage(fs_info, bio);
-
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4189,12 +4343,34 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
+static void prepare_eb_write(struct extent_buffer *eb)
+{
+ u32 nritems;
+ unsigned long start;
+ unsigned long end;
+
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
+ atomic_set(&eb->io_pages, num_extent_pages(eb));
+
+ /* Set btree blocks beyond nritems with 0 to avoid stale content */
+ nritems = btrfs_header_nritems(eb);
+ if (btrfs_header_level(eb) > 0) {
+ end = btrfs_node_key_ptr_offset(nritems);
+ memzero_extent_buffer(eb, end, eb->len - end);
+ } else {
+ /*
+ * Leaf:
+ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
+ */
+ start = btrfs_item_nr_offset(nritems);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
+ memzero_extent_buffer(eb, start, end - start);
+ }
+}
+
/*
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
- *
- * Caller should still call write_one_eb() other than this function directly.
- * As write_one_eb() has extra preparation before submitting the extent buffer.
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
@@ -4206,6 +4382,8 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
bool no_dirty_ebs = false;
int ret;
+ prepare_eb_write(eb);
+
/* clear_page_dirty_for_io() in subpage helper needs page locked */
lock_page(page);
btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
@@ -4216,10 +4394,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
- eb->start, eb->len, eb->start - page_offset(page),
- &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
- false);
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, page, eb->start, eb->len,
+ eb->start - page_offset(page),
+ end_bio_subpage_eb_writepage, 0, 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4244,45 +4422,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct extent_page_data *epd)
{
u64 disk_bytenr = eb->start;
- u32 nritems;
int i, num_pages;
- unsigned long start, end;
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
int ret = 0;
- clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
- num_pages = num_extent_pages(eb);
- atomic_set(&eb->io_pages, num_pages);
-
- /* set btree blocks beyond nritems with 0 to avoid stale content. */
- nritems = btrfs_header_nritems(eb);
- if (btrfs_header_level(eb) > 0) {
- end = btrfs_node_key_ptr_offset(nritems);
-
- memzero_extent_buffer(eb, end, eb->len - end);
- } else {
- /*
- * leaf:
- * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
- */
- start = btrfs_item_nr_offset(nritems);
- end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
- memzero_extent_buffer(eb, start, end - start);
- }
-
- if (eb->fs_info->sectorsize < PAGE_SIZE)
- return write_one_subpage_eb(eb, wbc, epd);
+ prepare_eb_write(eb);
+ num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- p, disk_bytenr, PAGE_SIZE, 0,
- &epd->bio,
+ &epd->bio_ctrl, p, disk_bytenr,
+ PAGE_SIZE, 0,
end_bio_extent_buffer_writepage,
- 0, 0, 0, false);
+ 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4386,7 +4542,7 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_eb(eb, wbc, epd);
+ ret = write_one_subpage_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
@@ -4498,7 +4654,7 @@ int btree_write_cache_pages(struct address_space *mapping,
{
struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4780,7 +4936,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4807,7 +4963,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
PAGE_SHIFT;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4827,8 +4983,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
- btrfs_writepage_endio_finish_ordered(page, start,
- start + PAGE_SIZE - 1, 1);
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
+ page, start, start + PAGE_SIZE - 1, 1);
unlock_page(page);
}
put_page(page);
@@ -4850,7 +5006,7 @@ int extent_writepages(struct address_space *mapping,
{
int ret = 0;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4867,8 +5023,7 @@ int extent_writepages(struct address_space *mapping,
void extent_readahead(struct readahead_control *rac)
{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
@@ -4879,14 +5034,14 @@ void extent_readahead(struct readahead_control *rac)
u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
- &em_cached, &bio, &bio_flags, &prev_em_start);
+ &em_cached, &bio_ctrl, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
- if (bio) {
- if (submit_one_bio(bio, 0, bio_flags))
+ if (bio_ctrl.bio) {
+ if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
return;
}
}
@@ -5429,6 +5584,12 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
subpage = (struct btrfs_subpage *)page->private;
if (atomic_read(&subpage->eb_refs))
return true;
+ /*
+ * Even there is no eb refs here, we may still have
+ * end_page_read() call relying on page::private.
+ */
+ if (atomic_read(&subpage->readers))
+ return true;
}
return false;
}
@@ -5489,7 +5650,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
/*
* We can only detach the page private if there are no other ebs in the
- * page range.
+ * page range and no unfinished IO.
*/
if (!page_range_has_eb(fs_info, page))
btrfs_detach_subpage(fs_info, page);
@@ -6176,7 +6337,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -6184,10 +6345,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
if (wait == WAIT_NONE) {
- ret = try_lock_extent(io_tree, eb->start,
- eb->start + eb->len - 1);
- if (ret <= 0)
- return ret;
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ return -EAGAIN;
} else {
ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
if (ret < 0)
@@ -6209,9 +6368,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
check_buffer_tree_ref(eb);
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
- eb->len, eb->start - page_offset(page), &bio,
- end_bio_extent_readpage, mirror_num, 0, 0,
+ btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
+ ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+ page, eb->start, eb->len,
+ eb->start - page_offset(page),
+ end_bio_extent_readpage, mirror_num, 0,
true);
if (ret) {
/*
@@ -6221,10 +6382,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- if (bio) {
+ if (bio_ctrl.bio) {
int tmp;
- tmp = submit_one_bio(bio, mirror_num, 0);
+ tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
+ bio_ctrl.bio = NULL;
if (tmp < 0)
return tmp;
}
@@ -6247,8 +6409,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -6312,9 +6473,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
ClearPageError(page);
err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
- page, page_offset(page), PAGE_SIZE, 0,
- &bio, end_bio_extent_readpage,
- mirror_num, 0, 0, false);
+ &bio_ctrl, page, page_offset(page),
+ PAGE_SIZE, 0, end_bio_extent_readpage,
+ mirror_num, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6331,8 +6492,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio) {
- err = submit_one_bio(bio, mirror_num, bio_flags);
+ if (bio_ctrl.bio) {
+ err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
+ bio_ctrl.bio = NULL;
if (err)
return err;
}
@@ -6515,9 +6677,10 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
- memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
- BTRFS_FSID_SIZE);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
+ chunk_tree_uuid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
@@ -6525,9 +6688,9 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
- memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
- BTRFS_FSID_SIZE);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 227215a5722c..62027f551b44 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,7 +39,7 @@ enum {
/* Page starts writeback, clear dirty bit and set writeback bit */
#define PAGE_START_WRITEBACK (1 << 1)
#define PAGE_END_WRITEBACK (1 << 2)
-#define PAGE_SET_PRIVATE2 (1 << 3)
+#define PAGE_SET_ORDERED (1 << 3)
#define PAGE_SET_ERROR (1 << 4)
#define PAGE_LOCK (1 << 5)
@@ -102,6 +102,17 @@ struct extent_buffer {
};
/*
+ * Structure to record info about the bio being assembled, and other info like
+ * how many bytes are there before stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ unsigned long bio_flags;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+};
+
+/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -169,7 +180,7 @@ int try_release_extent_buffer(struct page *page);
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags);
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct bio **bio, unsigned long *bio_flags,
+ struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
@@ -281,7 +292,7 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
+ * mirrors. If another mirror has good data, the sector is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
@@ -293,15 +304,13 @@ struct io_failure_record {
unsigned long bio_flags;
int this_mirror;
int failed_mirror;
- int in_validation;
};
-
-blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- submit_bio_hook_t *submit_bio_hook);
+int btrfs_repair_one_sector(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 294602f139ef..df6631eefc65 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -618,7 +618,7 @@ fail:
* @file_start: offset in file this bio begins to describe
* @contig: Boolean. If true/1 means all bio vecs in this bio are
* contiguous and they begin at @file_start in the file. False/0
- * means this bio can contains potentially discontigous bio vecs
+ * means this bio can contain potentially discontiguous bio vecs
* so the logical offset of each should be calculated separately.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
@@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
@@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
@@ -981,26 +1011,10 @@ again:
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- found_next = 1;
- goto insert;
- }
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
@@ -1056,8 +1070,48 @@ extend_csum:
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3b10d98b4ebb..28a05ba47060 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -482,6 +483,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
+ ASSERT(num_bytes <= U32_MAX);
end_of_last_block = start_pos + num_bytes - 1;
@@ -500,9 +502,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- SetPageUptodate(p);
+
+ btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
ClearPageChecked(p);
- set_page_dirty(p);
+ btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
/*
@@ -1094,7 +1097,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_nr = 0;
int del_slot = 0;
int recow;
- int ret;
+ int ret = 0;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
@@ -1315,7 +1318,7 @@ again:
}
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -2483,6 +2486,17 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 lockend,
struct extent_state **cached_state)
{
+ /*
+ * For subpage case, if the range is not at page boundary, we could
+ * have pages at the leading/tailing part of the range.
+ * This could lead to dead loop since filemap_range_has_page()
+ * will always return true.
+ * So here we need to do extra page alignment for
+ * filemap_range_has_page().
+ */
+ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+
while (1) {
struct btrfs_ordered_extent *ordered;
int ret;
@@ -2503,7 +2517,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
(ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)) {
+ page_lockstart, page_lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -3034,22 +3048,20 @@ struct falloc_range {
*/
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
- struct falloc_range *prev = NULL;
struct falloc_range *range = NULL;
- if (list_empty(head))
- goto insert;
-
- /*
- * As fallocate iterate by bytenr order, we only need to check
- * the last range.
- */
- prev = list_entry(head->prev, struct falloc_range, list);
- if (prev->start + prev->len == start) {
- prev->len += len;
- return 0;
+ if (!list_empty(head)) {
+ /*
+ * As fallocate iterates by bytenr order, we only need to check
+ * the last range.
+ */
+ range = list_last_entry(head, struct falloc_range, list);
+ if (range->start + range->len == start) {
+ range->len += len;
+ return 0;
+ }
}
-insert:
+
range = kmalloc(sizeof(*range), GFP_KERNEL);
if (!range)
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4806295116d8..2131ae5b9ed7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -327,7 +327,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
* need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, BTRFS_EXTENT_DATA_KEY);
+ 0, BTRFS_EXTENT_DATA_KEY, NULL);
if (ret)
goto fail;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 33f14573f2ec..e6eb20987351 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -51,6 +51,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
struct btrfs_iget_args {
u64 ino;
@@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *page;
while (index <= end_index) {
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
- ClearPagePrivate2(page);
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then __endio_write_update_ordered() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
put_page(page);
}
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
+ return;
/*
* In case this page belongs to the delalloc range being instantiated
* then skip it, since the first page of a range is going to be
* properly cleaned up by the caller of run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- offset += PAGE_SIZE;
- bytes -= PAGE_SIZE;
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
}
return __endio_write_update_ordered(inode, offset, bytes, false);
@@ -603,7 +629,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -946,7 +972,8 @@ retry:
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+ btrfs_writepage_endio_finish_ordered(inode, p, start,
+ end, 0);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() to unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
@@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /* we're not doing compressed IO, don't unlock the first
- * page (which the caller expects to stay locked), don't
- * clear any dirty bits and don't set any writeback bits
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
*
- * Do set the Private2 bit so we know this page was properly
- * setup for writepage
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
- page_ops |= PAGE_SET_PRIVATE2;
+ page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
@@ -1822,7 +1864,7 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
@@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ u32 bio_len = bio->bi_iter.bi_size;
struct extent_map *em;
- u64 length = 0;
- u64 map_length;
int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_iter.bi_size;
- map_length = length;
- em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
- map_length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
if (ret < 0)
goto out;
- if (geom.len < length + size)
+ if (geom.len < bio_len + size)
ret = 1;
out:
free_extent_map(em);
@@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered;
- u64 len = bio->bi_iter.bi_size + size;
- bool ret = true;
-
- ASSERT(btrfs_is_zoned(fs_info));
- ASSERT(fs_info->max_zone_append_size > 0);
- ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
-
- /* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
- if (!ordered)
- return ret;
-
- if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
- ordered->disk_bytenr + ordered->disk_num_bytes)
- ret = false;
-
- btrfs_put_ordered_extent(ordered);
-
- return ret;
-}
-
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
@@ -2601,7 +2612,7 @@ again:
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PagePrivate2(page))
+ if (PageOrdered(page))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
@@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- /* this page is properly in the ordered list */
- if (TestClearPagePrivate2(page))
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
return 0;
/*
@@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
- * number of bytes only for that range contaning the inline extent.
+ * number of bytes only for that range containing the inline extent.
* The remaining of the range will be processed when clearning the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
@@ -3000,6 +3011,18 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
@@ -3057,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
u64 end, int uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *wq;
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
-
- ClearPagePrivate2(page);
- if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1, uptodate))
- return;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered_extent->work);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
+ finish_ordered_fn, uptodate);
}
/*
@@ -3140,15 +3149,19 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
+ unsigned int result = 0;
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -3176,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
- if (ret < 0)
- return -EIO;
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
}
- return 0;
+ return result;
}
/*
@@ -4097,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
- * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
@@ -4452,20 +4469,36 @@ out:
#define NEED_TRUNCATE_BLOCK 1
/*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
+ * Remove inode items from a given root.
*
- * csum items that cross the new i_size are truncated to the new size
- * as well.
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @new_size: The new i_size for the inode. This is only applicable when
+ * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
+ * @min_type: The minimum key type to remove. All keys with a type
+ * greater than this value are removed and all keys with
+ * this type are removed only if their offset is >= @new_size.
+ * @extents_found: Output parameter that will contain the number of file
+ * extent items that were removed or adjusted to the new
+ * inode i_size. The caller is responsible for initializing
+ * the counter. Also, it can be NULL if the caller does not
+ * need this counter.
*
- * min_type is the minimum key type to truncate down to. If set to 0, this
- * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- u64 new_size, u32 min_type)
+ u64 new_size, u32 min_type,
+ u64 *extents_found)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -4611,6 +4644,9 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (extents_found != NULL)
+ (*extents_found)++;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
@@ -4929,7 +4965,7 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -5443,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0);
+ 0, 0, NULL);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -7925,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
btrfs_ino(BTRFS_I(inode)),
pgoff);
} else {
- blk_status_t status;
+ int ret;
ASSERT((start - io_bio->logical) < UINT_MAX);
- status = btrfs_submit_read_repair(inode,
- &io_bio->bio,
- start - io_bio->logical,
- bvec.bv_page, pgoff,
- start,
- start + sectorsize - 1,
- io_bio->mirror_num,
- submit_dio_repair_bio);
- if (status)
- err = status;
+ ret = btrfs_repair_one_sector(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start, io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
@@ -7952,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered = NULL;
- struct btrfs_workqueue *wq;
- u64 ordered_offset = offset;
- u64 ordered_bytes = bytes;
- u64 last_offset;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- while (ordered_offset < offset + bytes) {
- last_offset = ordered_offset;
- if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
- NULL);
- btrfs_queue_work(wq, &ordered->work);
- }
-
- /* No ordered extent found in the range, exit */
- if (ordered_offset == last_offset)
- return;
- /*
- * Our bio might span multiple ordered extents. In this case
- * we keep going until we have accounted the whole dio.
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- }
- }
+ btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
+ finish_ordered_fn, uptodate);
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -8160,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
goto out_err_em;
}
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
- logical, submit_len, &geom);
+ logical, &geom);
if (ret) {
status = errno_to_blk_status(ret);
goto out_err_em;
@@ -8264,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- unsigned long bio_flags = 0;
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
- if (bio)
- ret = submit_one_bio(bio, 0, bio_flags);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ if (bio_ctrl.bio)
+ ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
return ret;
}
@@ -8341,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping,
if (page_has_private(page))
attach_page_private(newpage, detach_page_private(page));
- if (PagePrivate2(page)) {
- ClearPagePrivate2(page);
- SetPagePrivate2(newpage);
+ if (PageOrdered(page)) {
+ ClearPageOrdered(page);
+ SetPageOrdered(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -8358,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
- u64 start;
- u64 end;
+ u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
- bool found_ordered = false;
- bool completed_ordered = false;
/*
- * we have the page locked, so new writeback can't start,
- * and the dirty bit won't be cleared while we are here.
+ * We have page locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this page.
*
- * Wait for IO on this page so that we can safely clear
- * the PagePrivate2 bit and do ordered accounting
+ * But already submitted bio can still be finished on this page.
+ * Furthermore, endio function won't skip page which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidatepage to do the same ordered extent accounting twice
+ * on one page.
+ *
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
- if (offset) {
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full page, we don't need to and
+ * shouldn't clear page extent mapped, as page->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that can invalidate the full even the range doesn't
+ * cover the full page, like invalidating the last page, we're
+ * still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == PAGE_SIZE)) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
@@ -8386,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
- start = page_start;
-again:
- ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
- if (ordered) {
- found_ordered = true;
- end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ bool delete_states;
+ u64 range_end;
+ u32 range_len;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ delete_states = true;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ delete_states = true;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ delete_states = false;
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+
/*
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, start, end,
+ clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
+
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur, 1)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ delete_states = true;
+ } else {
+ /*
+ * btrfs_finish_ordered_io() will get executed by endio
+ * of other pages, thus we can't delete extent states
+ * anymore
+ */
+ delete_states = false;
+ }
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
/*
- * whoever cleared the private bit is responsible
- * for the finish_ordered_io
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (TestClearPagePrivate2(page)) {
- spin_lock_irq(&inode->ordered_tree.lock);
- set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- ordered->truncated_len = min(ordered->truncated_len,
- start - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
-
- if (btrfs_dec_test_ordered_pending(inode, &ordered,
- start,
- end - start + 1, 1)) {
- btrfs_finish_ordered_io(ordered);
- completed_ordered = true;
- }
- }
- btrfs_put_ordered_extent(ordered);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
- cached_state = NULL;
- lock_extent_bits(tree, start, end,
- &cached_state);
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete_states, &cached_state);
}
-
- start = end + 1;
- if (start < page_end)
- goto again;
+ cur = range_end + 1;
}
-
/*
- * Qgroup reserved space handler
- * Page here will be either
- * 1) Already written to disk or ordered extent already submitted
- * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
- * Qgroup will be handled by its qgroup_record then.
- * btrfs_qgroup_free_data() call will do nothing here.
- *
- * 2) Not written to disk yet
- * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
- * bit of its io_tree, and free the qgroup reserved data space.
- * Since the IO will never happen for this page.
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
*/
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
- if (!inode_evicting) {
- bool delete = true;
-
- /*
- * If there's an ordered extent for this range and we have not
- * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
- * in the range for the ordered extent completion. We must also
- * not delete the range, otherwise we would lose that bit (and
- * any other bits set in the range). Make sure EXTENT_UPTODATE
- * is cleared if we don't delete, otherwise it can lead to
- * corruptions if the i_size is extented later.
- */
- if (found_ordered && !completed_ordered)
- delete = false;
- clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete, &cached_state);
-
+ ASSERT(!PageOrdered(page));
+ if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- }
-
ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8614,8 +8663,8 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
- SetPageUptodate(page);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8649,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8706,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
min_size, false);
BUG_ON(ret);
- /*
- * So if we truncate and then write and fsync we normally would just
- * write the extents that changed, which is a problem if we need to
- * first truncate that entire inode. So set this flag so we write out
- * all of the extents in the inode to the sync log so we're completely
- * safe.
- */
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
trans->block_rsv = rsv;
while (1) {
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ &extents_found);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8781,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
out:
btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (extents_found > 0)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
}
@@ -9076,6 +9135,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9135,6 +9195,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9150,8 +9211,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -10182,17 +10246,21 @@ out:
return ret;
}
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
+ u32 len;
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5dc2fd843ae3..0ba98e08a029 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -353,15 +353,55 @@ update_flags:
return ret;
}
+/*
+ * Start exclusive operation @type, return true on success
+ */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
- return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+ bool ret = false;
+
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+/*
+ * Conditionally allow to enter the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock and
+ * btrfs_exclop_finish.
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
+ */
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type)
+ return true;
+
+ spin_unlock(&fs_info->super_lock);
+ return false;
+}
+
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
+{
+ spin_unlock(&fs_info->super_lock);
}
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
+ spin_lock(&fs_info->super_lock);
WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
@@ -1455,7 +1495,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (btrfs_defrag_cancelled(fs_info)) {
btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
- break;
+ goto error;
}
if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
@@ -1533,6 +1573,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
+ ret = defrag_count;
+error:
if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
@@ -1546,8 +1588,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
}
- ret = defrag_count;
-
out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
@@ -1560,6 +1600,48 @@ out_ra:
return ret;
}
+/*
+ * Try to start exclusive operation @type or cancel it if it's running.
+ *
+ * Return:
+ * 0 - normal mode, newly claimed op started
+ * >0 - normal mode, something else is running,
+ * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
+ * ECANCELED - cancel mode, successful cancel
+ * ENOTCONN - cancel mode, operation not running anymore
+ */
+static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type, bool cancel)
+{
+ if (!cancel) {
+ /* Start normal op */
+ if (!btrfs_exclop_start(fs_info, type))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ /* Exclusive operation is now claimed */
+ return 0;
+ }
+
+ /* Cancel running op */
+ if (btrfs_exclop_start_try_lock(fs_info, type)) {
+ /*
+ * This blocks any exclop finish from setting it to NONE, so we
+ * request cancellation. Either it runs and we will wait for it,
+ * or it has finished and no waiting will happen.
+ */
+ atomic_inc(&fs_info->reloc_cancel_req);
+ btrfs_exclop_start_unlock(fs_info);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
+ TASK_INTERRUPTIBLE);
+
+ return -ECANCELED;
+ }
+
+ /* Something else is running or none */
+ return -ENOTCONN;
+}
+
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
@@ -1577,6 +1659,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
char *devstr = NULL;
int ret = 0;
int mod = 0;
+ bool cancel;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1585,20 +1668,23 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
- mnt_drop_write_file(file);
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- }
-
+ /*
+ * Read the arguments before checking exclusivity to be able to
+ * distinguish regular resize and cancel
+ */
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto out_drop;
}
-
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
sizestr = vol_args->name;
+ cancel = (strcmp("cancel", sizestr) == 0);
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
+ if (ret)
+ goto out_free;
+ /* Exclusive operation is now claimed */
+
devstr = strchr(sizestr, ':');
if (devstr) {
sizestr = devstr + 1;
@@ -1606,10 +1692,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
devstr = vol_args->name;
ret = kstrtoull(devstr, 10, &devid);
if (ret)
- goto out_free;
+ goto out_finish;
if (!devid) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
btrfs_info(fs_info, "resizing devid %llu", devid);
}
@@ -1619,7 +1705,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
ret = -ENODEV;
- goto out_free;
+ goto out_finish;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1627,7 +1713,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
"resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
if (!strcmp(sizestr, "max"))
@@ -1643,13 +1729,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = memparse(sizestr, &retptr);
if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
old_size = btrfs_device_get_total_bytes(device);
@@ -1657,24 +1743,24 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
ret = -ERANGE;
- goto out_free;
+ goto out_finish;
}
new_size = old_size + new_size;
}
if (new_size < SZ_256M) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_free;
+ goto out_finish;
}
new_size = round_down(new_size, fs_info->sectorsize);
@@ -1683,7 +1769,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_free;
+ goto out_finish;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans);
@@ -1696,10 +1782,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
"resize device %s (devid %llu) from %llu to %llu",
rcu_str_deref(device->name), device->devid,
old_size, new_size);
+out_finish:
+ btrfs_exclop_finish(fs_info);
out_free:
kfree(vol_args);
-out:
- btrfs_exclop_finish(fs_info);
+out_drop:
mnt_drop_write_file(file);
return ret;
}
@@ -2897,7 +2984,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = PTR_ERR(subvol_name_ptr);
goto free_parent;
}
- /* subvol_name_ptr is already NULL termined */
+ /* subvol_name_ptr is already nul terminated */
subvol_name = (char *)kbasename(subvol_name_ptr);
}
} else {
@@ -3119,6 +3206,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3137,18 +3225,22 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
ret = -EOPNOTSUPP;
goto out;
}
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
+ strcmp("cancel", vol_args->name) == 0)
+ cancel = true;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret)
goto out;
- }
+ /* Exclusive operation is now claimed */
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
- } else {
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ else
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
- }
+
btrfs_exclop_finish(fs_info);
if (!ret) {
@@ -3172,6 +3264,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
int ret;
+ bool cancel;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3180,25 +3273,24 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- goto out_drop_write;
- }
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto out_drop_write;
}
-
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ cancel = (strcmp("cancel", vol_args->name) == 0);
+
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret == 0) {
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ if (!ret)
+ btrfs_info(fs_info, "disk deleted %s", vol_args->name);
+ btrfs_exclop_finish(fs_info);
+ }
- if (!ret)
- btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
-out:
- btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3551,7 +3643,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 0);
+ ret = btrfs_commit_transaction_async(trans);
if (ret) {
btrfs_end_transaction(trans);
return ret;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5fafc5e89bb7..313d9d685adb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,7 +57,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
/*
* Try-lock for read.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
@@ -72,7 +72,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
/*
* Try-lock for write.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6c413bb451a3..6eb41b7c0c84 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,6 +16,7 @@
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -300,81 +301,142 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
}
/*
- * Finish IO for one ordered extent across a given range. The range can
- * contain several ordered extents.
+ * Mark all ordered extents io inside the specified range finished.
*
- * @found_ret: Return the finished ordered extent
- * @file_offset: File offset for the finished IO
- * Will also be updated to one byte past the range that is
- * recordered as finished. This allows caller to walk forward.
- * @io_size: Length of the finish IO range
- * @uptodate: If the IO finished without problem
- *
- * Return true if any ordered extent is finished in the range, and update
- * @found_ret and @file_offset.
- * Return false otherwise.
+ * @page: The invovled page for the opeartion.
+ * For uncompressed buffered IO, the page status also needs to be
+ * updated to indicate whether the pending ordered io is finished.
+ * Can be NULL for direct IO and compressed write.
+ * For these cases, callers are ensured they won't execute the
+ * endio function twice.
+ * @finish_func: The function to be executed when all the IO of an ordered
+ * extent are finished.
*
- * NOTE: Although The range can cross multiple ordered extents, only one
- * ordered extent will be updated during one call. The caller is responsible to
- * iterate all ordered extents in the range.
+ * This function is called for endio, thus the range must have ordered
+ * extent(s) coveri it.
*/
-bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **finished_ret,
- u64 *file_offset, u64 io_size, int uptodate)
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, btrfs_func_t finish_func,
+ bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_workqueue *wq;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- bool finished = false;
unsigned long flags;
- u64 dec_end;
- u64 dec_start;
- u64 to_dec;
+ u64 cur = file_offset;
+
+ if (btrfs_is_free_space_inode(inode))
+ wq = fs_info->endio_freespace_worker;
+ else
+ wq = fs_info->endio_write_workers;
+
+ if (page)
+ ASSERT(page->mapping && page_offset(page) <= file_offset &&
+ file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, *file_offset);
- if (!node)
- goto out;
+ while (cur < file_offset + num_bytes) {
+ u64 entry_end;
+ u64 end;
+ u32 len;
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
- goto out;
+ node = tree_search(tree, cur);
+ /* No ordered extents at all */
+ if (!node)
+ break;
- dec_start = max(*file_offset, entry->file_offset);
- dec_end = min(*file_offset + io_size,
- entry->file_offset + entry->num_bytes);
- *file_offset = dec_end;
- if (dec_start > dec_end) {
- btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
- dec_start, dec_end);
- }
- to_dec = dec_end - dec_start;
- if (to_dec > entry->bytes_left) {
- btrfs_crit(fs_info,
- "bad ordered accounting left %llu size %llu",
- entry->bytes_left, to_dec);
- }
- entry->bytes_left -= to_dec;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ entry_end = entry->file_offset + entry->num_bytes;
+ /*
+ * |<-- OE --->| |
+ * cur
+ * Go to next OE.
+ */
+ if (cur >= entry_end) {
+ node = rb_next(node);
+ /* No more ordered extents, exit */
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_ordered_extent,
+ rb_node);
+
+ /* Go to next ordered extent and continue */
+ cur = entry->file_offset;
+ continue;
+ }
+ /*
+ * | |<--- OE --->|
+ * cur
+ * Go to the start of OE.
+ */
+ if (cur < entry->file_offset) {
+ cur = entry->file_offset;
+ continue;
+ }
- if (entry->bytes_left == 0) {
/*
- * Ensure only one caller can set the flag and finished_ret
- * accordingly
+ * Now we are definitely inside one ordered extent.
+ *
+ * |<--- OE --->|
+ * |
+ * cur
*/
- finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /* test_and_set_bit implies a barrier */
- cond_wake_up_nomb(&entry->wait);
- }
-out:
- if (finished && finished_ret && entry) {
- *finished_ret = entry;
- refcount_inc(&entry->refs);
+ end = min(entry->file_offset + entry->num_bytes,
+ file_offset + num_bytes) - 1;
+ ASSERT(end + 1 - cur < U32_MAX);
+ len = end + 1 - cur;
+
+ if (page) {
+ /*
+ * Ordered (Private2) bit indicates whether we still
+ * have pending io unfinished for the ordered extent.
+ *
+ * If there's no such bit, we need to skip to next range.
+ */
+ if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
+ cur += len;
+ continue;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, len);
+ }
+
+ /* Now we're fine to update the accounting */
+ if (unlikely(len > entry->bytes_left)) {
+ WARN_ON(1);
+ btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode),
+ entry->file_offset,
+ entry->num_bytes,
+ len, entry->bytes_left);
+ entry->bytes_left = 0;
+ } else {
+ entry->bytes_left -= len;
+ }
+
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ /*
+ * All the IO of the ordered extent is finished, we need to queue
+ * the finish_func to be executed.
+ */
+ if (entry->bytes_left == 0) {
+ set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ cond_wake_up(&entry->wait);
+ refcount_inc(&entry->refs);
+ spin_unlock_irqrestore(&tree->lock, flags);
+ btrfs_init_work(&entry->work, finish_func, NULL, NULL);
+ btrfs_queue_work(wq, &entry->work);
+ spin_lock_irqsave(&tree->lock, flags);
+ }
+ cur += len;
}
spin_unlock_irqrestore(&tree->lock, flags);
- return finished;
}
/*
@@ -870,6 +932,81 @@ out:
}
/*
+ * Lookup the first ordered extent that overlaps the range
+ * [@file_offset, @file_offset + @len).
+ *
+ * The difference between this and btrfs_lookup_first_ordered_extent() is
+ * that this one won't return any ordered extent that does not overlap the range.
+ * And the difference against btrfs_lookup_ordered_extent() is, this function
+ * ensures the first ordered extent gets returned.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *node;
+ struct rb_node *cur;
+ struct rb_node *prev;
+ struct rb_node *next;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ spin_lock_irq(&tree->lock);
+ node = tree->tree.rb_node;
+ /*
+ * Here we don't want to use tree_search() which will use tree->last
+ * and screw up the search order.
+ * And __tree_search() can't return the adjacent ordered extents
+ * either, thus here we do our own search.
+ */
+ while (node) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset) {
+ node = node->rb_left;
+ } else if (file_offset >= entry_end(entry)) {
+ node = node->rb_right;
+ } else {
+ /*
+ * Direct hit, got an ordered extent that starts at
+ * @file_offset
+ */
+ goto out;
+ }
+ }
+ if (!entry) {
+ /* Empty tree */
+ goto out;
+ }
+
+ cur = &entry->rb_node;
+ /* We got an entry around @file_offset, check adjacent entries */
+ if (entry->file_offset < file_offset) {
+ prev = cur;
+ next = rb_next(cur);
+ } else {
+ prev = rb_prev(cur);
+ next = cur;
+ }
+ if (prev) {
+ entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ if (next) {
+ entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ /* No ordered extent in the range */
+ entry = NULL;
+out:
+ if (entry)
+ refcount_inc(&entry->refs);
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e60c07f36427..566472004edd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -172,13 +172,13 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, btrfs_func_t finish_func,
+ bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
-bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **finished_ret,
- u64 *file_offset, u64 io_size,
- int uptodate);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
@@ -196,6 +196,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 2dcb1cb21634..b1cb5a8c2999 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -260,6 +260,10 @@ static int prop_compression_validate(const char *value, size_t len)
if (btrfs_compress_is_valid_type(value, len))
return 0;
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0))
+ return 0;
+
return -EINVAL;
}
@@ -269,7 +273,17 @@ static int prop_compression_apply(struct inode *inode, const char *value,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int type;
+ /* Reset to defaults */
if (len == 0) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ return 0;
+ }
+
+ /* Set NOCOMPRESS flag */
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
@@ -348,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
/*
* This is not strictly necessary as the property should be
- * valid, but in case it isn't, don't propagate it futher.
+ * valid, but in case it isn't, don't propagate it further.
*/
ret = h->validate(value, strlen(value));
if (ret)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3ded812f522c..07ec06d4e972 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2521,7 +2521,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int ret = 0;
/*
- * If quotas get disabled meanwhile, the resouces need to be freed and
+ * If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -3545,13 +3545,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- /*
- * Can't hold an open transaction or we run the risk of deadlocking,
- * and can't either be under the context of a send operation (where
- * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
- * would result in a crash when starting a transaction and does not
- * make sense either (send is a read-only operation).
- */
+ /* Can't hold an open transaction or we run the risk of deadlocking. */
ASSERT(current->journal_info == NULL);
if (WARN_ON(current->journal_info))
return 0;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d434dc78dadf..9b0814318e72 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -7,6 +7,7 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "transaction.h"
+#include "subpage.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@@ -52,7 +53,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 datal,
const u8 comp_type)
{
- const u64 block_size = btrfs_inode_sectorsize(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 block_size = fs_info->sectorsize;
const u64 range_end = file_offset + block_size - 1;
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
@@ -106,10 +108,12 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
- memcpy_to_page(page, 0, data_start, datal);
+ memcpy_to_page(page, offset_in_page(file_offset), data_start,
+ datal);
flush_dcache_page(page);
} else {
- ret = btrfs_decompress(comp_type, data_start, page, 0,
+ ret = btrfs_decompress(comp_type, data_start, page,
+ offset_in_page(file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
@@ -133,9 +137,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
flush_dcache_page(page);
}
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
@@ -203,10 +207,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
@@ -222,13 +223,10 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
@@ -240,11 +238,13 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -282,11 +282,6 @@ copy_inline_extent:
out:
if (!ret && !trans) {
/*
- * Release path before starting a new transaction so we don't
- * hold locks that would confuse lockdep.
- */
- btrfs_release_path(path);
- /*
* No transaction here means we copied the inline extent into a
* page of the destination inode.
*
@@ -306,6 +301,21 @@ out:
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
/**
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b70be2ac2e9e..fc831597cb22 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2876,11 +2876,12 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
}
/*
- * Allow error injection to test balance cancellation
+ * Allow error injection to test balance/relocation cancellation
*/
noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
+ atomic_read(&fs_info->reloc_cancel_req) ||
fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
@@ -3780,6 +3781,60 @@ out:
return inode;
}
+/*
+ * Mark start of chunk relocation that is cancellable. Check if the cancellation
+ * has been requested meanwhile and don't start in that case.
+ *
+ * Return:
+ * 0 success
+ * -EINPROGRESS operation is already in progress, that's probably a bug
+ * -ECANCELED cancellation request was set before the operation started
+ * -EAGAIN can not start because there are ongoing send operations
+ */
+static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->send_reloc_lock);
+ if (fs_info->send_in_progress) {
+ btrfs_warn_rl(fs_info,
+"cannot run relocation while send operations are in progress (%d in progress)",
+ fs_info->send_in_progress);
+ spin_unlock(&fs_info->send_reloc_lock);
+ return -EAGAIN;
+ }
+ if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ /* This should not happen */
+ spin_unlock(&fs_info->send_reloc_lock);
+ btrfs_err(fs_info, "reloc already running, cannot start");
+ return -EINPROGRESS;
+ }
+ spin_unlock(&fs_info->send_reloc_lock);
+
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
+ btrfs_info(fs_info, "chunk relocation canceled on start");
+ /*
+ * On cancel, clear all requests but let the caller mark
+ * the end after cleanup operations.
+ */
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+ return -ECANCELED;
+ }
+ return 0;
+}
+
+/*
+ * Mark end of chunk relocation that is cancellable and wake any waiters.
+ */
+static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
+{
+ /* Requested after start, clear bit first so any waiters can continue */
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0)
+ btrfs_info(fs_info, "chunk relocation canceled during operation");
+ spin_lock(&fs_info->send_reloc_lock);
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
+ spin_unlock(&fs_info->send_reloc_lock);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+}
+
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -3862,6 +3917,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
return -ENOMEM;
}
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_put_bg;
+ }
+
rc->extent_root = extent_root;
rc->block_group = bg;
@@ -3952,7 +4013,9 @@ out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
- btrfs_put_block_group(rc->block_group);
+out_put_bg:
+ btrfs_put_block_group(bg);
+ reloc_chunk_end(fs_info);
free_reloc_control(rc);
return err;
}
@@ -4073,6 +4136,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out;
}
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_end;
+ }
+
rc->extent_root = fs_info->extent_root;
set_reloc_control(rc);
@@ -4137,6 +4206,8 @@ out_clean:
err = ret;
out_unset:
unset_reloc_control(rc);
+out_end:
+ reloc_chunk_end(fs_info);
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 485cda3eb8d7..088641ba7a8e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -165,6 +165,10 @@ struct scrub_ctx {
int readonly;
int pages_per_rd_bio;
+ /* State of IO submission throttling affecting the associated device */
+ ktime_t throttle_deadline;
+ u64 throttle_sent;
+
int is_dev_replace;
u64 write_pointer;
@@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
+ sctx->throttle_deadline = 0;
WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
@@ -626,7 +631,6 @@ nomem:
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
{
- u64 isize;
u32 nlink;
int ret;
int i;
@@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
- isize = btrfs_inode_size(eb, inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
@@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset,
- min(isize - offset, (u64)PAGE_SIZE), nlink,
+ fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
@@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
- * page by page this time in order to know which pages
+ * sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
- * pages from those mirrors without I/O error on the
- * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
- * would be that mirror #1 has an I/O error on the first page,
- * the second page is good, and mirror #2 has an I/O error on
- * the second page, but the first page is good.
- * Then the first page of the first mirror can be repaired by
- * taking the first page of the second mirror, and the
- * second page of the second mirror can be repaired by
- * copying the contents of the 2nd page of the 1st mirror.
- * One more note: if the pages of one mirror contain I/O
+ * sectors from those mirrors without I/O error on the
+ * particular sectors. One example (with blocks >= 2 * sectorsize)
+ * would be that mirror #1 has an I/O error on the first sector,
+ * the second sector is good, and mirror #2 has an I/O error on
+ * the second sector, but the first sector is good.
+ * Then the first sector of the first mirror can be repaired by
+ * taking the first sector of the second mirror, and the
+ * second sector of the second mirror can be repaired by
+ * copying the contents of the 2nd sector of the 1st mirror.
+ * One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the pages are picked from
+ * Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
@@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/*
* In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those pages.
- * Select the good pages from mirrors to rewrite bad pages from
+ * repaired, continue by picking good copies of those sectors.
+ * Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
- * all possible combinations of pages from the different mirrors
+ * all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
- * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
- * then the 2nd page of mirror #3 could be tried, whether now
+ * then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
@@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 length = original_sblock->page_count * fs_info->sectorsize;
u64 logical = original_sblock->pagev[0]->logical;
u64 generation = original_sblock->pagev[0]->generation;
u64 flags = original_sblock->pagev[0]->flags;
@@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
while (length > 0) {
- sublen = min_t(u64, length, PAGE_SIZE);
+ sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
bbio = NULL;
/*
- * with a length of PAGE_SIZE, each returned stripe
- * represents one mirror
+ * With a length of sectorsize, each returned stripe represents
+ * one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
@@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio = btrfs_io_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
BUG_ON(spage_bad->page == NULL);
BUG_ON(spage_good->page == NULL);
@@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
- ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
- if (PAGE_SIZE != ret) {
+ ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
+ if (ret != sectorsize) {
bio_put(bio);
return -EIO;
}
@@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
{
struct scrub_bio *sbio;
int ret;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
mutex_lock(&sctx->wr_lock);
again:
@@ -1681,16 +1686,16 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
+ sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->fs_info->sectorsize;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage)
}
}
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+ const int time_slice = 1000;
+ struct scrub_bio *sbio;
+ struct btrfs_device *device;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
+
+ sbio = sctx->bios[sctx->curr];
+ device = sbio->dev;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
+ }
+
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ } else {
+ /* New request after deadline, start new epoch */
+ delta = 0;
+ }
+
+ if (delta) {
+ long timeout;
+
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
+ }
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
+}
+
static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
@@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
if (sctx->curr == -1)
return;
+ scrub_throttle(sctx);
+
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
@@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
again:
@@ -2044,9 +2112,9 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_READ;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical ||
sbio->dev != spage->dev) {
scrub_submit(sctx);
@@ -2054,8 +2122,8 @@ again:
}
sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
if (sblock->sparity && corrupted && !sblock->data_corrected) {
u64 start = sblock->pagev[0]->logical;
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
- PAGE_SIZE;
+ sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
@@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su
* the csum into @csum.
*
* The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
* that is before @logical.
*
* Return 0 if there is no csum for the range.
@@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
nstripes = div64_u64(length, map->stripe_len);
+ mirror_num = 1;
+ increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
- mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
- mirror_num = 1;
- } else {
- increment = map->stripe_len;
- mirror_num = 1;
}
path = btrfs_alloc_path();
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bd69db72acc5..6ac37ae6c811 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2078,16 +2078,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
}
/*
- * Removes the entry from the list and adds it back to the end. This marks the
- * entry as recently used so that name_cache_clean_unused does not remove it.
- */
-static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
-{
- list_del(&nce->list);
- list_add_tail(&nce->list, &sctx->name_cache_list);
-}
-
-/*
* Remove some entries from the beginning of name_cache_list.
*/
static void name_cache_clean_unused(struct send_ctx *sctx)
@@ -2147,7 +2137,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
kfree(nce);
nce = NULL;
} else {
- name_cache_used(sctx, nce);
+ /*
+ * Removes the entry from the list and adds it back to
+ * the end. This marks the entry as recently used so
+ * that name_cache_clean_unused does not remove it.
+ */
+ list_move_tail(&nce->list, &sctx->name_cache_list);
+
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
@@ -4064,6 +4060,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -6507,7 +6514,7 @@ static int changed_extent(struct send_ctx *sctx,
* updates the inode item, but it only changes the iversion (sequence
* field in the inode item) of the inode, so if a file is deduplicated
* the same amount of times in both the parent and send snapshots, its
- * iversion becames the same in both snapshots, whence the inode item is
+ * iversion becomes the same in both snapshots, whence the inode item is
* the same on both snapshots.
*/
if (sctx->cur_ino != sctx->cmp_key->objectid)
@@ -7409,23 +7416,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
- mutex_lock(&fs_info->balance_mutex);
- if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
+ spin_lock(&fs_info->send_reloc_lock);
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ spin_unlock(&fs_info->send_reloc_lock);
btrfs_warn_rl(fs_info,
- "cannot run send because a balance operation is in progress");
+ "cannot run send because a relocation operation is in progress");
ret = -EAGAIN;
goto out;
}
fs_info->send_in_progress++;
- mutex_unlock(&fs_info->balance_mutex);
+ spin_unlock(&fs_info->send_reloc_lock);
- current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
- current->journal_info = NULL;
- mutex_lock(&fs_info->balance_mutex);
+ spin_lock(&fs_info->send_reloc_lock);
fs_info->send_in_progress--;
- mutex_unlock(&fs_info->balance_mutex);
+ spin_unlock(&fs_info->send_reloc_lock);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2dc674b7c3b1..f79bf85f2439 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -133,18 +133,13 @@
* operations, however they won't be usable until the transaction commits.
*
* COMMIT_TRANS
- * may_commit_transaction() is the ultimate arbiter on whether we commit the
- * transaction or not. In order to avoid constantly churning we do all the
- * above flushing first and then commit the transaction as the last resort.
- * However we need to take into account things like pinned space that would
- * be freed, plus any delayed work we may not have gotten rid of in the case
- * of metadata.
- *
- * FORCE_COMMIT_TRANS
- * For use by the preemptive flusher. We use this to bypass the ticketing
- * checks in may_commit_transaction, as we have more information about the
- * overall state of the system and may want to commit the transaction ahead
- * of actual ENOSPC conditions.
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this waits born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
*
* OVERCOMMIT
*
@@ -197,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
- GFP_KERNEL);
- if (ret) {
- kfree(space_info);
- return ret;
- }
-
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -389,7 +377,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
- /* Check and see if our ticket can be satisified now. */
+ /* Check and see if our ticket can be satisfied now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
@@ -495,7 +483,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
*/
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- u64 to_reclaim, bool wait_ordered)
+ u64 to_reclaim, bool wait_ordered,
+ bool for_preempt)
{
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
@@ -532,7 +521,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
- if (ordered_bytes > delalloc_bytes)
+ if (ordered_bytes > delalloc_bytes && !for_preempt)
wait_ordered = true;
loops = 0;
@@ -551,6 +540,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
break;
}
+ /*
+ * If we are for preemption we just want a one-shot of delalloc
+ * flushing so we can stop flushing if we decide we don't need
+ * to anymore.
+ */
+ if (for_preempt)
+ break;
+
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -566,109 +563,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
}
}
-/**
- * Possibly commit the transaction if its ok to
- *
- * @fs_info: the filesystem
- * @space_info: space_info we are checking for commit, either data or metadata
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- struct reserve_ticket *ticket = NULL;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
- struct btrfs_trans_handle *trans;
- u64 reclaim_bytes = 0;
- u64 bytes_needed = 0;
- u64 cur_free_bytes = 0;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- if (trans)
- return -EAGAIN;
-
- spin_lock(&space_info->lock);
- cur_free_bytes = btrfs_space_info_used(space_info, true);
- if (cur_free_bytes < space_info->total_bytes)
- cur_free_bytes = space_info->total_bytes - cur_free_bytes;
- else
- cur_free_bytes = 0;
-
- if (!list_empty(&space_info->priority_tickets))
- ticket = list_first_entry(&space_info->priority_tickets,
- struct reserve_ticket, list);
- else if (!list_empty(&space_info->tickets))
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
- if (ticket)
- bytes_needed = ticket->bytes;
-
- if (bytes_needed > cur_free_bytes)
- bytes_needed -= cur_free_bytes;
- else
- bytes_needed = 0;
- spin_unlock(&space_info->lock);
-
- if (!bytes_needed)
- return 0;
-
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * See if there is enough pinned space to make this reservation, or if
- * we have block groups that are going to be freed, allowing us to
- * possibly do a chunk allocation the next loop through.
- */
- if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
- __percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
- goto commit;
-
- /*
- * See if there is some space in the delayed insertion reserve for this
- * reservation. If the space_info's don't match (like for DATA or
- * SYSTEM) then just go enospc, reclaiming this space won't recover any
- * space to satisfy those reservations.
- */
- if (space_info != delayed_rsv->space_info)
- goto enospc;
-
- spin_lock(&delayed_rsv->lock);
- reclaim_bytes += delayed_rsv->reserved;
- spin_unlock(&delayed_rsv->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- reclaim_bytes += delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
-
- spin_lock(&trans_rsv->lock);
- reclaim_bytes += trans_rsv->reserved;
- spin_unlock(&trans_rsv->lock);
-
- if (reclaim_bytes >= bytes_needed)
- goto commit;
- bytes_needed -= reclaim_bytes;
-
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
- goto enospc;
-
-commit:
- return btrfs_commit_transaction(trans);
-enospc:
- btrfs_end_transaction(trans);
- return -ENOSPC;
-}
-
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
@@ -702,7 +596,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT);
+ state == FLUSH_DELALLOC_WAIT, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -743,9 +637,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_wait_on_delayed_iputs(fs_info);
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info);
- break;
- case FORCE_COMMIT_TRANS:
+ ASSERT(current->journal_info == NULL);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -792,12 +684,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
u64 used;
/* If we're just plain full then async reclaim just slows us down. */
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+ if ((space_info->bytes_used + space_info->bytes_reserved +
+ global_rsv_size) >= thresh)
return false;
/*
@@ -838,8 +732,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
thresh = calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
- thresh += (space_info->total_bytes - space_info->bytes_used -
- space_info->bytes_reserved - space_info->bytes_readonly);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_readonly + global_rsv_size;
+ if (used < space_info->total_bytes)
+ thresh += space_info->total_bytes - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -860,14 +756,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* clearly be heavy enough to warrant preemptive flushing. In the case
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
+ *
+ * We want to make sure we truly are maxed out on ordered however, so
+ * cut ordered in half, and if it's still higher than delalloc then we
+ * can keep flushing. This is to avoid the case where we start
+ * flushing, and now delalloc == ordered and we stop preemptively
+ * flushing when we could still have several gigs of delalloc to flush.
*/
- ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
else
- used += space_info->bytes_may_use;
+ used += space_info->bytes_may_use - global_rsv_size;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
@@ -921,7 +823,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
- u64 first_ticket_bytes = 0;
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
@@ -937,21 +838,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- /*
- * may_commit_transaction will avoid committing the transaction
- * if it doesn't feel like the space reclaimed by the commit
- * would result in the ticket succeeding. However if we have a
- * smaller ticket in the queue it may be small enough to be
- * satisified by committing the transaction, so if any
- * subsequent ticket is smaller than the first ticket go ahead
- * and send us back for another loop through the enospc flushing
- * code.
- */
- if (first_ticket_bytes == 0)
- first_ticket_bytes = ticket->bytes;
- else if (first_ticket_bytes > ticket->bytes)
- return true;
-
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
@@ -1117,7 +1003,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
(delayed_block_rsv->reserved +
delayed_refs_rsv->reserved)) {
to_reclaim = space_info->bytes_pinned;
- flush = FORCE_COMMIT_TRANS;
+ flush = COMMIT_TRANS;
} else if (delayed_block_rsv->reserved >
delayed_refs_rsv->reserved) {
to_reclaim = delayed_block_rsv->reserved;
@@ -1171,28 +1057,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* immediately re-usable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
- * FLUSH_DELAYED_REFS
- * The above two cases generate delayed refs that will affect
- * ->total_bytes_pinned. However this counter can be inconsistent with
- * reality if there are outstanding delayed refs. This is because we adjust
- * the counter based solely on the current set of delayed refs and disregard
- * any on-disk state which might include more refs. So for example, if we
- * have an extent with 2 references, but we only drop 1, we'll see that there
- * is a negative delayed ref count for the extent and assume that the space
- * will be freed, and thus increase ->total_bytes_pinned.
- *
- * Running the delayed refs gives us the actual real view of what will be
- * freed at the transaction commit time. This stage will not actually free
- * space for us, it just makes sure that may_commit_transaction() has all of
- * the information it needs to make the right decision.
- *
* COMMIT_TRANS
- * This is where we reclaim all of the pinned space generated by the previous
- * two stages. We will not commit the transaction if we don't think we're
- * likely to satisfy our request, which means if our current free space +
- * total_bytes_pinned < reservation we will not commit. This is why the
- * previous states are actually important, to make sure we know for sure
- * whether committing the transaction will allow us to make progress.
+ * This is where we reclaim all of the pinned space generated by running the
+ * iputs
*
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
@@ -1202,7 +1069,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_WAIT,
RUN_DELAYED_IPUTS,
- FLUSH_DELAYED_REFS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
};
@@ -1561,6 +1427,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
+ /*
+ * We were forced to add a reserve ticket, so
+ * our preemptive flushing is unable to keep
+ * up. Clamp down on the threshold for the
+ * preemptive flushing in order to keep up with
+ * the workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
+
space_info->flush = 1;
trace_btrfs_trigger_flush(fs_info,
space_info->flags,
@@ -1572,14 +1447,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
-
- /*
- * We were forced to add a reserve ticket, so our preemptive
- * flushing is unable to keep up. Clamp down on the threshold
- * for the preemptive flushing in order to keep up with the
- * workload.
- */
- maybe_clamp_preempt(fs_info, space_info);
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -1588,8 +1455,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_preemptive_reclaim(fs_info, space_info) &&
- !work_busy(&fs_info->preempt_reclaim_work)) {
+ !work_busy(&fs_info->preempt_reclaim_work) &&
+ need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index b1a8ffb03b3e..cb5056472e79 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -43,18 +43,6 @@ struct btrfs_space_info {
u64 flags;
- /*
- * bytes_pinned is kept in line with what is actually pinned, as in
- * we've called update_block_group and dropped the bytes_used counter
- * and increased the bytes_pinned counter. However this means that
- * bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed every time we
- * call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zeroed every
- * time the transaction commits.
- */
- struct percpu_counter total_bytes_pinned;
-
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
@@ -157,22 +145,4 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
-
-static inline void __btrfs_mod_total_bytes_pinned(
- struct btrfs_space_info *space_info,
- s64 mod)
-{
- percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
- u64 flags, s64 mod)
-{
- struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
-
- ASSERT(space_info);
- __btrfs_mod_total_bytes_pinned(space_info, mod);
-}
-
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 2d19089ab625..640bcd21bf28 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -3,6 +3,7 @@
#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
+#include "btrfs_inode.h"
/*
* Subpage (sectorsize < PAGE_SIZE) support overview:
@@ -110,10 +111,12 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
if (!*ret)
return -ENOMEM;
spin_lock_init(&(*ret)->lock);
- if (type == BTRFS_SUBPAGE_METADATA)
+ if (type == BTRFS_SUBPAGE_METADATA) {
atomic_set(&(*ret)->eb_refs, 0);
- else
+ } else {
atomic_set(&(*ret)->readers, 0);
+ atomic_set(&(*ret)->writers, 0);
+ }
return 0;
}
@@ -183,12 +186,10 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
const int nbits = len >> fs_info->sectorsize_bits;
- int ret;
btrfs_subpage_assert(fs_info, page, start, len);
- ret = atomic_add_return(nbits, &subpage->readers);
- ASSERT(ret == nbits);
+ atomic_add(nbits, &subpage->readers);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
@@ -196,10 +197,95 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
const int nbits = len >> fs_info->sectorsize_bits;
+ bool is_data;
+ bool last;
btrfs_subpage_assert(fs_info, page, start, len);
+ is_data = is_data_inode(page->mapping->host);
ASSERT(atomic_read(&subpage->readers) >= nbits);
- if (atomic_sub_and_test(nbits, &subpage->readers))
+ last = atomic_sub_and_test(nbits, &subpage->readers);
+
+ /*
+ * For data we need to unlock the page if the last read has finished.
+ *
+ * And please don't replace @last with atomic_sub_and_test() call
+ * inside if () condition.
+ * As we want the atomic_sub_and_test() to be always executed.
+ */
+ if (is_data && last)
+ unlock_page(page);
+}
+
+static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+{
+ u64 orig_start = *start;
+ u32 orig_len = *len;
+
+ *start = max_t(u64, page_offset(page), orig_start);
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
+}
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->readers) == 0);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret == nbits);
+}
+
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->writers) >= nbits);
+ return atomic_sub_and_test(nbits, &subpage->writers);
+}
+
+/*
+ * Lock a page for delalloc page writeback.
+ *
+ * Return -EAGAIN if the page is not properly initialized.
+ * Return 0 with the page locked, and writer counter updated.
+ *
+ * Even with 0 returned, the page still need extra check to make sure
+ * it's really the correct page, as the caller is using
+ * find_get_pages_contig(), which can race with page invalidating.
+ */
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
+ lock_page(page);
+ return 0;
+ }
+ lock_page(page);
+ if (!PagePrivate(page) || !page->private) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ btrfs_subpage_clamp_range(page, &start, &len);
+ btrfs_subpage_start_writer(fs_info, page, start, len);
+ return 0;
+}
+
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+ btrfs_subpage_clamp_range(page, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
unlock_page(page);
}
@@ -354,6 +440,32 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
+void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->ordered_bitmap |= tmp;
+ SetPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->ordered_bitmap &= ~tmp;
+ if (subpage->ordered_bitmap == 0)
+ ClearPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -376,6 +488,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -408,6 +521,34 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ return test_page_func(page); \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
@@ -416,3 +557,5 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
+ PageOrdered);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index bfd626e955be..4d7aca85d915 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -22,6 +22,14 @@ struct btrfs_subpage {
u16 error_bitmap;
u16 dirty_bitmap;
u16 writeback_bitmap;
+ /*
+ * Both data and metadata needs to track how many readers are for the
+ * page.
+ * Data relies on @readers to unlock the page when last reader finished.
+ * While metadata doesn't need page unlock, it needs to prevent
+ * page::private get cleared before the last end_page_read().
+ */
+ atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -32,7 +40,10 @@ struct btrfs_subpage {
atomic_t eb_refs;
/* Structures only used by data */
struct {
- atomic_t readers;
+ atomic_t writers;
+
+ /* Tracke pending ordered extent in this sector */
+ u16 ordered_bitmap;
};
};
};
@@ -63,6 +74,15 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
/*
* Template for subpage related operations.
*
@@ -72,6 +92,10 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* btrfs_page_*() are for call sites where the page can either be subpage
* specific or regular page. The function will handle both cases.
* But the range still needs to be inside the page.
+ *
+ * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * need to be inside the page. Those functions will truncate the range
+ * automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -85,12 +109,19 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len); \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+DECLARE_BTRFS_SUBPAGE_OPS(ordered);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4a396c1147f1..d07b18b2b250 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -299,17 +299,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
- /* Nothing used. The other threads that have joined this
- * transaction may be able to continue. */
- if (!trans->dirty && list_empty(&trans->new_bgs)) {
- const char *errstr;
-
- errstr = btrfs_decode_error(errno);
- btrfs_warn(fs_info,
- "%s:%d: Aborting unused transaction(%s).",
- function, line, errstr);
- return;
- }
WRITE_ONCE(trans->transaction->aborted, errno);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
@@ -945,8 +934,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_check_integrity_including_extent_data:
btrfs_info(info,
"enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
@@ -1527,7 +1515,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
seq_puts(seq, ",check_int_data");
else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 436ac7b4b334..9d1d140118ff 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -429,7 +429,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -451,7 +451,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
-static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
- struct kobj_attribute *a,
- char *buf)
-{
- struct btrfs_space_info *sinfo = to_space_info(kobj);
- s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
-}
-
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -684,8 +675,6 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(space_info, total_bytes_pinned,
- btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, flags),
@@ -698,7 +687,6 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
- BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -706,7 +694,6 @@ ATTRIBUTE_GROUPS(space_info);
static void space_info_release(struct kobject *kobj)
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
- percpu_counter_destroy(&sinfo->total_bytes_pinned);
kfree(sinfo);
}
@@ -1455,6 +1442,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ READ_ONCE(device->scrub_speed_max));
+}
+
+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+ char *endptr;
+ unsigned long long limit;
+
+ limit = memparse(buf, &endptr);
+ WRITE_ONCE(device->scrub_speed_max, limit);
+ return len;
+}
+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
+ btrfs_devinfo_scrub_speed_max_store);
+
static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1468,10 +1482,40 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ if (!device->dev_stats_valid)
+ return scnprintf(buf, PAGE_SIZE, "invalid\n");
+
+ /*
+ * Print all at once so we get a snapshot of all values from the same
+ * time. Keep them in sync and in order of definition of
+ * btrfs_dev_stat_values.
+ */
+ return scnprintf(buf, PAGE_SIZE,
+ "write_errs %d\n"
+ "read_errs %d\n"
+ "flush_errs %d\n"
+ "corruption_errs %d\n"
+ "generation_errs %d\n",
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+
static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, error_stats),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, scrub_speed_max),
BTRFS_ATTR_PTR(devid, writeable),
NULL
};
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index c0aefe6dee0b..319fed82d741 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -557,7 +557,7 @@ int btrfs_test_extent_map(void)
{
/*
* Test a chunk with 2 data stripes one of which
- * interesects the physical address of the super block
+ * intersects the physical address of the super block
* is correctly recognised.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f75de9f6c0ad..50318231c1a8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -583,9 +583,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- /* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
-
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -1406,8 +1403,10 @@ int btrfs_defrag_root(struct btrfs_root *root)
while (1) {
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
ret = btrfs_defrag_leaves(trans, root);
@@ -1476,7 +1475,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
/*
@@ -1869,31 +1868,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
}
/*
- * wait for the current transaction commit to start and block subsequent
- * transaction joins
- */
-static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START ||
- TRANS_ABORTED(trans));
-}
-
-/*
- * wait for the current transaction to start and then become unblocked.
- * caller holds ref.
- */
-static void wait_current_trans_commit_start_and_unblock(
- struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED ||
- TRANS_ABORTED(trans));
-}
-
-/*
* commit transactions asynchronously. once btrfs_commit_transaction_async
* returns, any subsequent transaction will not be allowed to join.
*/
@@ -1920,8 +1894,7 @@ static void do_async_commit(struct work_struct *work)
kfree(ac);
}
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock)
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_async_commit *ac;
@@ -1953,13 +1926,13 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
-
- /* wait for transaction to start and unblock */
- if (wait_for_unblock)
- wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
- else
- wait_current_trans_commit_start(fs_info, cur_trans);
-
+ /*
+ * Wait for the current transaction commit to start and block
+ * subsequent transaction joins
+ */
+ wait_event(fs_info->transaction_blocked_wait,
+ cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(cur_trans));
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2074,14 +2047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
- /*
- * Some places just start a transaction to commit it. We need to make
- * sure that if this commit fails that the abort code actually marks the
- * transaction as failed, so set trans->dirty to make the abort code do
- * the right thing.
- */
- trans->dirty = true;
-
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 364cfbb4c5c5..07d76029f598 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -122,8 +122,6 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB ((void *)1)
-
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
@@ -143,7 +141,6 @@ struct btrfs_trans_handle {
bool allocating_chunk;
bool can_flush_pending_bgs;
bool reloc_reserved;
- bool dirty;
bool in_fsync;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
@@ -227,8 +224,7 @@ void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 326be57f2828..cab451d19547 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -3297,6 +3302,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* begins and releases it only after writing its superblock.
*/
mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
+ */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out_wake_log_root;
+ }
+
btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
@@ -4447,7 +4468,8 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_inode_items(trans,
root->log_root,
inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ NULL);
} while (ret == -EAGAIN);
if (ret)
goto out;
@@ -5395,7 +5417,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags);
while(1) {
ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0);
+ log, inode, 0, 0, NULL);
if (ret != -EAGAIN)
break;
}
@@ -5445,13 +5467,23 @@ log_extents:
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
- if (!err && !xattrs_logged) {
+ if (err)
+ goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
err = btrfs_log_all_xattrs(trans, root, inode, path,
dst_path);
+ if (err)
+ goto out_unlock;
btrfs_release_path(path);
}
- if (err)
- goto out_unlock;
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
@@ -6350,6 +6382,7 @@ next:
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 47d27059d064..782e16795bc4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -717,7 +717,7 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but curently device didn't
+ * multiple successful changes of FSID but currently device didn't
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
@@ -1550,7 +1550,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* check to ensure dev extents are not double allocated.
* This makes the function safe to allocate dev extents but may not report
* correct usable device space, as device extent freed in current transaction
- * is not reported as avaiable.
+ * is not reported as available.
*/
static int find_free_dev_extent_start(struct btrfs_device *device,
u64 num_bytes, u64 search_start, u64 *start,
@@ -4217,14 +4217,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_bg_type_to_raid_name(data_target));
}
- if (fs_info->send_in_progress) {
- btrfs_warn_rl(fs_info,
-"cannot run balance while send operations are in progress (%d in progress)",
- fs_info->send_in_progress);
- ret = -EAGAIN;
- goto out;
- }
-
ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -6127,17 +6119,17 @@ static bool need_full_stripe(enum btrfs_map_op op)
* @em: mapping containing the logical extent
* @op: type of operation - write or read
* @logical: address that we want to figure out the geometry of
- * @len: the length of IO we are going to perform, starting at @logical
* @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
- enum btrfs_map_op op, u64 logical, u64 len,
+ enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom)
{
struct map_lookup *map;
+ u64 len;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
@@ -6152,7 +6144,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe wher this block falls in */
+ /* Stripe where this block falls in */
stripe_nr = div64_u64(offset, stripe_len);
/* Offset of stripe in the chunk */
stripe_offset = stripe_nr * stripe_len;
@@ -6243,7 +6235,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
em = btrfs_get_chunk_map(fs_info, logical, *length);
ASSERT(!IS_ERR(em));
- ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
if (ret < 0)
return ret;
@@ -6670,8 +6662,6 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
- *
- * If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid)
@@ -7865,7 +7855,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
}
- /* Make sure no dev extent is beyond device bondary */
+ /* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9c0d84e5ec06..c7fc7caf575c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -143,6 +143,9 @@ struct btrfs_device {
struct completion kobj_unregister;
/* For sysfs/FSID/devinfo/devid/ */
struct kobject devid_kobj;
+
+ /* Bandwidth limit for scrub, in bytes */
+ u64 scrub_speed_max;
};
/*
@@ -443,7 +446,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
- enum btrfs_map_op op, u64 logical, u64 len,
+ enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1bb8ee97aae0..297c0b1c0634 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -81,7 +81,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
* *: Special case, no superblock is written
* 0: Use write pointer of zones[0]
* 1: Use write pointer of zones[1]
- * C: Compare super blcoks from zones[0] and zones[1], use the latest
+ * C: Compare super blocks from zones[0] and zones[1], use the latest
* one determined by generation
* x: Invalid state
*/
@@ -150,6 +150,18 @@ static inline u32 sb_zone_number(int shift, int mirror)
return (u32)zone;
}
+static inline sector_t zone_start_sector(u32 zone_number,
+ struct block_device *bdev)
+{
+ return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
+}
+
+static inline u64 zone_start_physical(u32 zone_number,
+ struct btrfs_zoned_device_info *zone_info)
+{
+ return (u64)zone_number << zone_info->zone_size_shift;
+}
+
/*
* Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
* device into static sized chunks and fake a conventional zone on each of
@@ -405,8 +417,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (sb_zone + 1 >= zone_info->nr_zones)
continue;
- sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
- ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+ ret = btrfs_get_dev_zones(device,
+ zone_start_physical(sb_zone, zone_info),
&zone_info->sb_zones[sb_pos],
&nr_zones);
if (ret)
@@ -421,7 +433,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
}
/*
- * If zones[0] is conventional, always use the beggining of the
+ * If zones[0] is conventional, always use the beginning of the
* zone to record superblock. No need to validate in that case.
*/
if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
@@ -721,7 +733,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
+ ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
zones);
if (ret < 0)
@@ -826,7 +838,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
return -ENOENT;
return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sb_zone << zone_sectors_shift,
+ zone_start_sector(sb_zone, bdev),
zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
@@ -878,7 +890,8 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
if (!(end <= sb_zone ||
sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
have_sb = true;
- pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+ pos = zone_start_physical(
+ sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
break;
}
@@ -1127,6 +1140,10 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT,
+ rcu_str_deref(device->name), device->devid);
ret = -EIO;
goto out;
}
@@ -1187,6 +1204,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical);
+ ret = -EIO;
+ goto out;
+ }
cache->alloc_offset = alloc_offsets[0];
break;
case BTRFS_BLOCK_GROUP_DUP:
@@ -1204,6 +1228,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ if (cache->alloc_offset > fs_info->zone_size) {
+ btrfs_err(fs_info,
+ "zoned: invalid write pointer %llu in block group %llu",
+ cache->alloc_offset, cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1502,3 +1533,24 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
length = wp - physical_pos;
return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}
+
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return ERR_CAST(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+
+ free_extent_map(em);
+
+ return device;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e55d32595c2c..b0ae2608cb6b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -65,6 +65,8 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -191,6 +193,13 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
return -EOPNOTSUPP;
}
+static inline struct btrfs_device *btrfs_zoned_get_device(
+ struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)