diff options
Diffstat (limited to 'fs/bcachefs/fs-io-buffered.c')
-rw-r--r-- | fs/bcachefs/fs-io-buffered.c | 401 |
1 files changed, 173 insertions, 228 deletions
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 39292e7ef342..66bacdd49f78 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } @@ -117,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, if (!get_more) break; + unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); + + if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) + break; + + unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + + /* ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; @@ -156,47 +159,39 @@ static void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; - u32 snapshot; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); -retry: bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + POS(inum.inum, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_slots); while (1) { struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; + unsigned bytes, sectors; + s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -207,51 +202,65 @@ retry: ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) - break; + goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); + /* + * Careful there's a landmine here if bch2_read_extent() ever + * starts returning transaction restarts here. + * + * We've changed rbio->bi_iter.bi_size to be "bytes we can read + * from this extent" with the swap call, and we restore it + * below. That restore needs to come before checking for + * errors. + * + * But unlike __bch2_read(), we use the rbio bvec iter, not one + * on the stack, so we can't do the restore right after the + * bch2_read_extent() call: we don't own that iterator anymore + * if BCH_READ_last_fragment is set, since we may have submitted + * that rbio instead of cloning it. + */ - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } -err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } @@ -264,9 +273,9 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; - struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; + struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); @@ -274,8 +283,19 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; + /* + * Besides being a general performance optimization, plugging helps with + * avoiding btree transaction srcu warnings - submitting a bio can + * block, and we don't want todo that with the transaction locked. + * + * However, plugged bios are submitted when we schedule; we ideally + * would have our own scheduler hook to call unlock_long() before + * scheduling. + */ + blk_start_plug(&plug); bch2_pagecache_add_get(inode); + struct btree_trans *trans = bch2_trans_get(c); while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, readpages_iter.folios.nr - @@ -284,22 +304,23 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); bch2_trans_unlock(trans); } + bch2_trans_put(trans); bch2_pagecache_add_put(inode); - - bch2_trans_put(trans); + blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } @@ -314,24 +335,30 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; + struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); + BUG_ON(folio_test_uptodate(folio)); + BUG_ON(folio_test_dirty(folio)); + if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); + blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -367,17 +394,9 @@ struct bch_writepage_state { struct bch_io_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; + struct blk_plug plug; }; -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to @@ -408,7 +427,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - folio_set_error(fi.folio); mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); @@ -419,7 +437,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; @@ -445,8 +463,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -494,7 +512,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; + op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; @@ -542,7 +560,7 @@ do_io: if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } @@ -624,15 +642,6 @@ do_io: BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; @@ -649,17 +658,17 @@ do_io: int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; + struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); + bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + + blk_start_plug(&w->plug); + int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + if (w->io) + bch2_writepage_do_io(w); + blk_finish_plug(&w->plug); + kfree(w->tmp); + kfree(w); return bch2_err_class(ret); } @@ -667,7 +676,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -686,9 +695,9 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); @@ -736,12 +745,11 @@ out: goto err; } - *pagep = &folio->page; + *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); - *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); kfree(res); @@ -751,12 +759,11 @@ err_unlock: int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); @@ -810,8 +817,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -828,23 +834,13 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping), &fs); if (ret) goto out; BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -872,24 +868,32 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { + ssize_t f_reserved; + f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; + f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); + + if (unlikely(f_reserved != f_len)) { + if (f_reserved < 0) { + if (f == darray_first(fs)) { + ret = f_reserved; + goto out; + } + + folios_trunc(&fs, fi); + end = min(end, folio_end_pos(darray_last(fs))); + } else { + if (!folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + folios_trunc(&fs, fi + 1); + end = f_pos + f_reserved; + } - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&fs, fi); - if (!fs.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(fs))); break; } @@ -906,7 +910,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; @@ -939,10 +943,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -986,68 +988,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1072,17 +1018,12 @@ again: } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1103,46 +1044,50 @@ again: } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } |