summaryrefslogtreecommitdiff
path: root/fs/bcachefs/fs-io-buffered.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/fs-io-buffered.c')
-rw-r--r--fs/bcachefs/fs-io-buffered.c271
1 files changed, 140 insertions, 131 deletions
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 73c12e565af5..66bacdd49f78 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- if (!bio->bi_status) {
- folio_mark_uptodate(fi.folio);
- } else {
- folio_clear_uptodate(fi.folio);
- folio_set_error(fi.folio);
- }
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
bio_put(bio);
}
@@ -117,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans,
if (!get_more)
break;
+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
+
+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
+ break;
+
+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
+
+ /* ensure proper alignment */
+ order = min(order, __ffs(folio_offset|BIT(31)));
+
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
if (!folio)
break;
@@ -156,47 +159,39 @@ static void bchfs_read(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
- int flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE;
- u32 snapshot;
+ int flags = BCH_READ_retry_if_stale|
+ BCH_READ_may_promote;
int ret = 0;
- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
-retry:
bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
+ BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
- unsigned bytes, sectors, offset_into_extent;
+ unsigned bytes, sectors;
+ s64 offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -207,51 +202,65 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
if (readpages_iter) {
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
- break;
+ goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
swap(rbio->bio.bi_iter.bi_size, bytes);
if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_last_fragment;
bch2_bio_page_state_set(&rbio->bio, k);
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
+ /*
+ * Careful there's a landmine here if bch2_read_extent() ever
+ * starts returning transaction restarts here.
+ *
+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read
+ * from this extent" with the swap call, and we restore it
+ * below. That restore needs to come before checking for
+ * errors.
+ *
+ * But unlike __bch2_read(), we use the rbio bvec iter, not one
+ * on the stack, so we can't do the restore right after the
+ * bch2_read_extent() call: we don't own that iterator anymore
+ * if BCH_READ_last_fragment is set, since we may have submitted
+ * that rbio instead of cloning it.
+ */
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if (flags & BCH_READ_last_fragment)
break;
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
-err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- iter.pos.inode,
- iter.pos.offset << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
@@ -264,9 +273,9 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts;
- struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
+ struct blk_plug plug;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
@@ -274,8 +283,19 @@ void bch2_readahead(struct readahead_control *ractl)
if (ret)
return;
+ /*
+ * Besides being a general performance optimization, plugging helps with
+ * avoiding btree transaction srcu warnings - submitting a bio can
+ * block, and we don't want todo that with the transaction locked.
+ *
+ * However, plugged bios are submitted when we schedule; we ideally
+ * would have our own scheduler hook to call unlock_long() before
+ * scheduling.
+ */
+ blk_start_plug(&plug);
bch2_pagecache_add_get(inode);
+ struct btree_trans *trans = bch2_trans_get(c);
while ((folio = readpage_iter_peek(&readpages_iter))) {
unsigned n = min_t(unsigned,
readpages_iter.folios.nr -
@@ -284,37 +304,26 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_read_bio *rbio =
rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_readpages_end_io);
readpage_iter_advance(&readpages_iter);
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
bchfs_read(trans, rbio, inode_inum(inode),
&readpages_iter);
bch2_trans_unlock(trans);
}
+ bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
-
- bch2_trans_put(trans);
+ blk_finish_plug(&plug);
darray_exit(&readpages_iter.folios);
}
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum, struct folio *folio)
-{
- bch2_folio_create(folio, __GFP_NOFAIL);
-
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
static void bch2_read_single_folio_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -326,17 +335,30 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_read_bio *rbio;
struct bch_io_opts opts;
+ struct blk_plug plug;
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ BUG_ON(folio_test_uptodate(folio));
+ BUG_ON(folio_test_dirty(folio));
+
+ if (!bch2_folio_create(folio, GFP_KERNEL))
+ return -ENOMEM;
+
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_read_single_folio_end_io);
rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
- __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ blk_start_plug(&plug);
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
+ blk_finish_plug(&plug);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -372,17 +394,9 @@ struct bch_writepage_state {
struct bch_io_opts opts;
struct bch_folio_sector *tmp;
unsigned tmp_sectors;
+ struct blk_plug plug;
};
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
/*
* Determine when a writepage io is full. We have to limit writepage bios to a
* single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
@@ -413,7 +427,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- folio_set_error(fi.folio);
mapping_set_error(fi.folio->mapping, -EIO);
s = __bch2_folio(fi.folio);
@@ -424,7 +437,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
}
}
- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ if (io->op.flags & BCH_WRITE_wrote_data_inline) {
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
@@ -450,8 +463,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
*/
/*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
+ * The writeback flag is effectively our ref on the inode -
+ * fixup i_blocks before calling folio_end_writeback:
*/
bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
@@ -499,7 +512,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
+ op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;
@@ -547,7 +560,7 @@ do_io:
if (f_sectors > w->tmp_sectors) {
kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
w->tmp_sectors = f_sectors;
}
@@ -629,15 +642,6 @@ do_io:
BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
sectors << 9, offset << 9));
- /* Check for writing past i_size: */
- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags),
- "writing past i_size: %llu > %llu (unrounded %llu)\n",
- bio_end_sector(&w->io->op.wbio.bio) << 9,
- round_up(i_size, block_bytes(c)),
- i_size);
-
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
w->io->op.new_i_size = i_size;
@@ -654,17 +658,17 @@ do_io:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
+ struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
+ bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+
+ blk_start_plug(&w->plug);
+ int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ if (w->io)
+ bch2_writepage_do_io(w);
+ blk_finish_plug(&w->plug);
+ kfree(w->tmp);
+ kfree(w);
return bch2_err_class(ret);
}
@@ -672,7 +676,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
int bch2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -691,9 +695,9 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_pagecache_add_get(inode);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
- mapping_gfp_mask(mapping));
- if (IS_ERR_OR_NULL(folio))
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
goto err_unlock;
offset = pos - folio_pos(folio);
@@ -741,12 +745,11 @@ out:
goto err;
}
- *pagep = &folio->page;
+ *foliop = folio;
return 0;
err:
folio_unlock(folio);
folio_put(folio);
- *pagep = NULL;
err_unlock:
bch2_pagecache_add_put(inode);
kfree(res);
@@ -756,12 +759,11 @@ err_unlock:
int bch2_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation *res = fsdata;
- struct folio *folio = page_folio(page);
unsigned offset = pos - folio_pos(folio);
lockdep_assert_held(&inode->v.i_rwsem);
@@ -832,9 +834,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_init(&fs);
ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
- mapping_gfp_mask(mapping),
- &fs);
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping), &fs);
if (ret)
goto out;
@@ -867,24 +868,32 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
f_pos = pos;
f_offset = pos - folio_pos(darray_first(fs));
darray_for_each(fs, fi) {
+ ssize_t f_reserved;
+
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
+
+ if (unlikely(f_reserved != f_len)) {
+ if (f_reserved < 0) {
+ if (f == darray_first(fs)) {
+ ret = f_reserved;
+ goto out;
+ }
+
+ folios_trunc(&fs, fi);
+ end = min(end, folio_end_pos(darray_last(fs)));
+ } else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ folios_trunc(&fs, fi + 1);
+ end = f_pos + f_reserved;
+ }
- /*
- * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
- * supposed to write as much as we have disk space for.
- *
- * On failure here we should still write out a partial page if
- * we aren't completely out of disk space - we don't do that
- * yet:
- */
- ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
- if (unlikely(ret)) {
- folios_trunc(&fs, fi);
- if (!fs.nr)
- goto out;
-
- end = min(end, folio_end_pos(darray_last(fs)));
break;
}
@@ -901,7 +910,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_for_each(fs, fi) {
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
if (!f_copied) {
folios_trunc(&fs, fi);
break;