Diffstat (limited to 'fs/bcachefs/io_write.c')
-rw-r--r--	fs/bcachefs/io_write.c	| 690
1 file changed, 401 insertions(+), 289 deletions(-)
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f137252bccc5..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -6,6 +6,7 @@
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "async_objs.h"
 #include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
@@ -15,6 +16,7 @@
 #include "compress.h"
 #include "debug.h"
 #include "ec.h"
+#include "enumerated_ref.h"
 #include "error.h"
 #include "extent_update.h"
 #include "inode.h"
@@ -34,6 +36,12 @@
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -69,11 +77,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 	u64 io_latency = time_after64(now, submit_time)
 		? now - submit_time
 		: 0;
-	u64 old, new, v = atomic64_read(latency);
+	u64 old, new;
 
+	old = atomic64_read(latency);
 	do {
-		old = v;
-
 		/*
 		 * If the io latency was reasonably close to the current
 		 * latency, skip doing the update and atomic operation - most of
@@ -84,7 +91,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 			break;
 
 		new = ewma_add(old, io_latency, 5);
-	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+	} while (!atomic64_try_cmpxchg(latency, &old, new));
 
 	bch2_congested_acct(ca, io_latency, now, rw);
 
@@ -163,10 +170,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 	*i_sectors_delta	= 0;
 	*disk_sectors_delta	= 0;
 
-	bch2_trans_copy_iter(&iter, extent_iter);
+	bch2_trans_copy_iter(trans, &iter, extent_iter);
 
-	for_each_btree_key_upto_continue_norestart(iter,
-				new->k.p, BTREE_ITER_SLOTS, old, ret) {
+	for_each_btree_key_max_continue_norestart(trans, iter,
+				new->k.p, BTREE_ITER_slots, old, ret) {
 		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
 			max(bkey_start_offset(&new->k),
 			    bkey_start_offset(old.k));
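Note: the atomic64_try_cmpxchg() conversion in the latency-accounting hunk above is the standard lockless read-modify-write shape - try_cmpxchg refreshes 'old' on failure, so the explicit 'old = v' re-read disappears. A minimal user-space sketch with C11 atomics; the ewma_add() definition below is an assumption standing in for bcachefs's own helper, not copied from the kernel:

	#include <stdatomic.h>
	#include <stdint.h>

	/* assumed EWMA: new = old - old/2^w + val/2^w (stand-in for bcachefs's helper) */
	static uint64_t ewma_add(uint64_t old, uint64_t val, unsigned w)
	{
		return old - (old >> w) + (val >> w);
	}

	static void latency_acct(_Atomic uint64_t *latency, uint64_t io_latency)
	{
		uint64_t old = atomic_load(latency);
		uint64_t new;

		do {
			/* skip the atomic entirely when close to the current average */
			int64_t d = (int64_t) (old - io_latency);
			if ((d < 0 ? -d : d) < (int64_t) (old >> 1))
				return;

			new = ewma_add(old, io_latency, 5);
			/* on failure, 'old' is reloaded with the current value -
			 * the same contract as the kernel's atomic64_try_cmpxchg() */
		} while (!atomic_compare_exchange_weak(latency, &old, new));
	}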
@@ -199,9 +206,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 					      u64 new_i_size,
 					      s64 i_sectors_delta)
 {
-	struct btree_iter iter;
-	struct bkey_i *k;
-	struct bkey_i_inode_v3 *inode;
 	/*
 	 * Crazy performance optimization:
 	 * Every extent update needs to also update the inode: the inode trigger
 	 * will set bi->journal_seq to the journal sequence number of this
 	 * transaction - for fsync.
 	 *
 	 * But if that's the only reason we're updating the inode (we're not
 	 * updating bi_size or bi_sectors), then we don't need the inode update
 	 * to be journalled - if we crash, the bi_journal_seq update will be
 	 * lost, but that's fine.
 	 */
-	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
-	int ret;
+	unsigned inode_update_flags = BTREE_UPDATE_nojournal;
 
-	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
 			      SPOS(0,
 				   extent_iter->pos.inode,
 				   extent_iter->snapshot),
-			      BTREE_ITER_CACHED);
-	ret = PTR_ERR_OR_ZERO(k);
+			      BTREE_ITER_intent|
+			      BTREE_ITER_cached);
+	int ret = bkey_err(k);
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
-		k = bch2_inode_to_v3(trans, k);
-		ret = PTR_ERR_OR_ZERO(k);
+	/*
+	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
+	 * bytes past the end of the buffer:
+	 */
+	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+	ret = PTR_ERR_OR_ZERO(k_mut);
+	if (unlikely(ret))
+		goto err;
+
+	bkey_reassemble(k_mut, k);
+
+	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+		k_mut = bch2_inode_to_v3(trans, k_mut);
+		ret = PTR_ERR_OR_ZERO(k_mut);
 		if (unlikely(ret))
 			goto err;
 	}
 
-	inode = bkey_i_to_inode_v3(k);
+	struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
 
 	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
 	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -241,17 +257,42 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 	}
 
 	if (i_sectors_delta) {
+		s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
+		if (unlikely(bi_sectors + i_sectors_delta < 0)) {
+			struct bch_fs *c = trans->c;
+			struct printbuf buf = PRINTBUF;
+			bch2_log_msg_start(c, &buf);
+			prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
+				   extent_iter->pos.inode, bi_sectors, i_sectors_delta);
+
+			bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
+			if (print)
+				bch2_print_str(c, KERN_ERR, buf.buf);
+			printbuf_exit(&buf);
+
+			if (i_sectors_delta < 0)
+				i_sectors_delta = -bi_sectors;
+			else
+				i_sectors_delta = 0;
+		}
+
 		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
 		inode_update_flags = 0;
 	}
 
+	/*
+	 * extents, dirents and xattrs updates require that an inode update also
+	 * happens - to ensure that if a key exists in one of those btrees with
+	 * a given snapshot ID an inode is also present - so we may have to skip
+	 * the nojournal optimization:
+	 */
 	if (inode->k.p.snapshot != iter.snapshot) {
 		inode->k.p.snapshot = iter.snapshot;
 		inode_update_flags = 0;
 	}
 
 	ret = bch2_trans_update(trans, &iter, &inode->k_i,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+				BTREE_UPDATE_internal_snapshot_node|
 				inode_update_flags);
 err:
 	bch2_trans_iter_exit(trans, &iter);
@@ -278,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans,
 	 * path already traversed at iter->pos because
 	 * bch2_trans_extent_update() will use it to attempt extent merging
 	 */
-	ret = __bch2_btree_iter_traverse(iter);
+	ret = __bch2_btree_iter_traverse(trans, iter);
 	if (ret)
 		return ret;
 
@@ -323,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans,
 	if (i_sectors_delta_total)
 		*i_sectors_delta_total += i_sectors_delta;
 
-	bch2_btree_iter_set_pos(iter, next_pos);
+	bch2_btree_iter_set_pos(trans, iter, next_pos);
 
 	return 0;
 }
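Note: the new i_sectors underflow handling above clamps the delta rather than letting the inode's sector count wrap negative, and logs/counts the inconsistency. A standalone sketch of just the clamping rule (function name hypothetical):

	#include <stdint.h>

	/*
	 * Clamp an i_sectors adjustment so the counter never goes negative:
	 * mirrors the recovery logic in the hunk above (sketch, not kernel code).
	 */
	static int64_t clamp_i_sectors_delta(int64_t bi_sectors, int64_t delta)
	{
		if (bi_sectors + delta >= 0)
			return delta;
		/* underflow: shrink a negative delta to exactly zero the counter,
		 * and drop a positive one entirely */
		return delta < 0 ? -bi_sectors : 0;
	}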
@@ -360,13 +401,12 @@ static int bch2_write_index_default(struct bch_write_op *op)
 
 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 				     bkey_start_pos(&sk.k->k),
-				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+				     BTREE_ITER_slots|BTREE_ITER_intent);
 
-		ret =   bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
-			bch2_extent_update(trans, inum, &iter, sk.k,
+		ret = bch2_extent_update(trans, inum, &iter, sk.k,
 					&op->res,
 					op->new_i_size, &op->i_sectors_delta,
-					op->flags & BCH_WRITE_CHECK_ENOSPC);
+					op->flags & BCH_WRITE_check_enospc);
 		bch2_trans_iter_exit(trans, &iter);
 
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -388,6 +428,38 @@ static int bch2_write_index_default(struct bch_write_op *op)
 
 /* Writes */
 
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
+{
+	struct printbuf buf = PRINTBUF;
+
+	if (op->subvol) {
+		bch2_inum_offset_err_msg(op->c, &buf,
+					 (subvol_inum) { op->subvol, op->pos.inode, },
+					 offset << 9);
+	} else {
+		struct bpos pos = op->pos;
+		pos.offset = offset;
+		bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
+	}
+
+	prt_str(&buf, "write error: ");
+
+	va_list args;
+	va_start(args, fmt);
+	prt_vprintf(&buf, fmt, args);
+	va_end(args);
+
+	if (op->flags & BCH_WRITE_move) {
+		struct data_update *u = container_of(op, struct data_update, op);
+
+		prt_printf(&buf, "\n  from internal move ");
+		bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
+	}
+
+	bch_err_ratelimited(op->c, "%s", buf.buf);
+	printbuf_exit(&buf);
+}
+
 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 			       enum bch_data_type type,
 			       const struct bkey_i *k,
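Note: bch2_write_op_error() above builds the whole message - position prefix, "write error: ", the formatted cause, and optional move context - in one printbuf before logging, so ratelimiting applies to complete messages. A user-space sketch of the same va_list-into-buffer pattern, with snprintf standing in for the kernel printbuf API:

	#include <stdarg.h>
	#include <stdio.h>

	__attribute__((format(printf, 3, 4)))
	static void write_op_error(char *buf, size_t len, const char *fmt, ...)
	{
		/* fixed prefix first, then the caller's formatted cause */
		size_t used = (size_t) snprintf(buf, len, "write error: ");
		va_list args;

		va_start(args, fmt);
		vsnprintf(buf + used, len > used ? len - used : 0, fmt, args);
		va_end(args);
	}

	/* usage: char msg[256]; write_op_error(msg, sizeof(msg),
	 *        "btree update error: %s", "ENOSPC"); */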
@@ -395,17 +467,29 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
 	struct bch_write_bio *n;
+	unsigned ref_rw  = type == BCH_DATA_btree ? READ : WRITE;
+	unsigned ref_idx = type == BCH_DATA_btree
+		? BCH_DEV_READ_REF_btree_node_write
+		: BCH_DEV_WRITE_REF_io_write;
 
 	BUG_ON(c->opts.nochanges);
 
-	bkey_for_each_ptr(ptrs, ptr) {
-		BUG_ON(!bch2_dev_exists2(c, ptr->dev));
+	const struct bch_extent_ptr *last = NULL;
+	bkey_for_each_ptr(ptrs, ptr)
+		last = ptr;
 
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	bkey_for_each_ptr(ptrs, ptr) {
+		/*
+		 * XXX: btree writes should be using io_ref[WRITE], but we
+		 * aren't retrying failed btree writes yet (due to device
+		 * removal/ro):
+		 */
+		struct bch_dev *ca = nocow
+			? bch2_dev_have_ref(c, ptr->dev)
+			: bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
 
-		if (to_entry(ptr + 1) < ptrs.end) {
-			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-						GFP_NOFS, &ca->replica_set));
+		if (ptr != last) {
+			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
 
 			n->bio.bi_end_io	= wbio->bio.bi_end_io;
 			n->bio.bi_private	= wbio->bio.bi_private;
@@ -422,11 +506,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		n->c			= c;
 		n->dev			= ptr->dev;
-		n->have_ioref		= nocow || bch2_dev_get_ioref(ca,
-					type == BCH_DATA_btree ? READ : WRITE);
+		n->have_ioref		= ca != NULL;
 		n->nocow		= nocow;
 		n->submit_time		= local_clock();
 		n->inode_offset		= bkey_start_offset(&k->k);
+		if (nocow)
+			n->nocow_bucket	= PTR_BUCKET_NR(ca, ptr);
 		n->bio.bi_iter.bi_sector = ptr->offset;
 
 		if (likely(n->have_ioref)) {
@@ -460,20 +545,21 @@ static void bch2_write_done(struct closure *cl)
 
 	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 	bch2_disk_reservation_put(c, &op->res);
 
-	if (!(op->flags & BCH_WRITE_MOVE))
-		bch2_write_ref_put(c, BCH_WRITE_REF_write);
+	if (!(op->flags & BCH_WRITE_move))
+		enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
 	bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
 	EBUG_ON(cl->parent);
 	closure_debug_destroy(cl);
+	async_object_list_del(c, write_op, op->list_idx);
 	if (op->end_io)
 		op->end_io(op);
 }
 
 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 {
+	struct bch_fs *c = op->c;
 	struct keylist *keys = &op->insert_keys;
-	struct bch_extent_ptr *ptr;
 	struct bkey_i *src, *dst = keys->keys, *n;
 
 	for (src = keys->keys; src != keys->top; src = n) {
@@ -484,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 				     test_bit(ptr->dev, op->failed.d));
 
 		if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
-			return -EIO;
+			return bch_err_throw(c, data_write_io);
 	}
 
 	if (dst != src)
@@ -507,7 +593,7 @@ static void __bch2_write_index(struct bch_write_op *op)
 	unsigned dev;
 	int ret = 0;
 
-	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+	if (unlikely(op->flags & BCH_WRITE_io_error)) {
 		ret = bch2_write_drop_io_error_ptrs(op);
 		if (ret)
 			goto err;
@@ -516,7 +602,7 @@ static void __bch2_write_index(struct bch_write_op *op)
 	if (!bch2_keylist_empty(keys)) {
 		u64 sectors_start = keylist_sectors(keys);
 
-		ret = !(op->flags & BCH_WRITE_MOVE)
+		ret = !(op->flags & BCH_WRITE_move)
 			? bch2_write_index_default(op)
 			: bch2_data_update_index_update(op);
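Note: the reworked replica loop above first finds the last pointer, then submits a clone of the bio for every pointer except that one, so the original bio is consumed exactly once. A sketch of the same "clone for all but the final element" shape (types and callback hypothetical):

	#include <stddef.h>

	struct replica { int dev; };

	/* 'submit' stands in for bio submission; 'cloned' says whether the
	 * caller should clone the original I/O for this target */
	static void submit_to_replicas(struct replica *r, size_t nr,
				       void (*submit)(struct replica *, int cloned))
	{
		if (!nr)
			return;

		struct replica *last = &r[nr - 1];

		for (size_t i = 0; i < nr; i++)
			/* clone for everything except the last target, which
			 * reuses the original I/O */
			submit(&r[i], &r[i] != last);
	}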
"move" : "user", - bch2_err_str(ret)); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); } if (ret) @@ -541,21 +624,29 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_DONE; + op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -569,7 +660,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? WRITE_POINT_waiting_io : WRITE_POINT_stopped; @@ -583,8 +674,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_DONE) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -615,20 +706,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); + op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -642,25 +731,41 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + struct bch_dev *ca = wbio->have_ioref + ? 
@@ -642,25 +731,41 @@ static void bch2_write_endio(struct bio *bio)
 	struct bch_write_bio *wbio	= to_wbio(bio);
 	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
 	struct bch_fs *c		= wbio->c;
-	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
-
-	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
-				    op->pos.inode,
-				    wbio->inode_offset << 9,
-				    "data write error: %s",
-				    bch2_blk_status_to_str(bio->bi_status))) {
+	struct bch_dev *ca		= wbio->have_ioref
+		? bch2_dev_have_ref(c, wbio->dev)
+		: NULL;
+
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+				   wbio->submit_time, !bio->bi_status);
+
+	if (unlikely(bio->bi_status)) {
+		if (ca)
+			bch_err_inum_offset_ratelimited(ca,
+				op->pos.inode,
+				wbio->inode_offset << 9,
+				"data write error: %s",
+				bch2_blk_status_to_str(bio->bi_status));
+		else
+			bch_err_inum_offset_ratelimited(c,
+				op->pos.inode,
+				wbio->inode_offset << 9,
+				"data write error: %s",
+				bch2_blk_status_to_str(bio->bi_status));
 		set_bit(wbio->dev, op->failed.d);
-		op->flags |= BCH_WRITE_IO_ERROR;
+		op->flags |= BCH_WRITE_io_error;
 	}
 
-	if (wbio->nocow)
+	if (wbio->nocow) {
+		bch2_bucket_nocow_unlock(&c->nocow_locks,
+					 POS(ca->dev_idx, wbio->nocow_bucket),
+					 BUCKET_NOCOW_LOCK_UPDATE);
 		set_bit(wbio->dev, op->devs_need_flush->d);
-
-	if (wbio->have_ioref) {
-		bch2_latency_acct(ca, wbio->submit_time, WRITE);
-		percpu_ref_put(&ca->io_ref);
 	}
 
+	if (wbio->have_ioref)
+		enumerated_ref_put(&ca->io_ref[WRITE],
+				   BCH_DEV_WRITE_REF_io_write);
+
 	if (wbio->bounce)
 		bch2_bio_free_pages_pool(c, bio);
 
@@ -685,7 +790,7 @@ static void init_append_extent(struct bch_write_op *op,
 	e = bkey_extent_init(op->insert_keys.top);
 	e->k.p		= op->pos;
 	e->k.size	= crc.uncompressed_size;
-	e->k.version	= version;
+	e->k.bversion	= version;
 
 	if (crc.csum_type ||
 	    crc.compression_type ||
@@ -693,7 +798,10 @@ static void init_append_extent(struct bch_write_op *op,
 		bch2_extent_crc_append(&e->k_i, crc);
 
 	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
-				       op->flags & BCH_WRITE_CACHED);
+				       op->flags & BCH_WRITE_cached);
+
+	if (!(op->flags & BCH_WRITE_move))
+		bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
 
 	bch2_keylist_push(&op->insert_keys);
 }
@@ -753,7 +861,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
 {
 	struct bio *bio = &op->wbio.bio;
 	struct bch_extent_crc_unpacked new_crc;
-	int ret;
 
 	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
 
@@ -761,10 +868,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
 	    bch2_csum_type_is_encryption(new_csum_type))
 		new_csum_type = op->crc.csum_type;
 
-	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
-				  NULL, &new_crc,
-				  op->crc.offset, op->crc.live_size,
-				  new_csum_type);
+	int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+				      NULL, &new_crc,
+				      op->crc.offset, op->crc.live_size,
+				      new_csum_type);
 	if (ret)
 		return ret;
 
@@ -774,44 +881,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
 	return 0;
 }
 
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
-	struct bch_fs *c = op->c;
-	struct nonce nonce = extent_nonce(op->version, op->crc);
-	struct bch_csum csum;
-	int ret;
-
-	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
-		return 0;
-
-	/*
-	 * If we need to decrypt data in the write path, we'll no longer be able
-	 * to verify the existing checksum (poly1305 mac, in this case) after
-	 * it's decrypted - this is the last point we'll be able to reverify the
-	 * checksum:
-	 */
-	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
-	if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
-		return -EIO;
-
-	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
-	op->crc.csum_type = 0;
-	op->crc.csum = (struct bch_csum) { 0, 0 };
-	return ret;
-}
-
-static enum prep_encoded_ret {
-	PREP_ENCODED_OK,
-	PREP_ENCODED_ERR,
-	PREP_ENCODED_CHECKSUM_ERR,
-	PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
 {
 	struct bch_fs *c = op->c;
 	struct bio *bio = &op->wbio.bio;
-
-	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
-		return PREP_ENCODED_OK;
+	struct bch_csum csum;
+	int ret = 0;
 
 	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
 
@@ -822,12 +897,13 @@
 	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
 	     op->incompressible)) {
 		if (!crc_is_compressed(op->crc) &&
-		    op->csum_type != op->crc.csum_type &&
-		    bch2_write_rechecksum(c, op, op->csum_type) &&
-		    !c->opts.no_data_io)
-			return PREP_ENCODED_CHECKSUM_ERR;
+		    op->csum_type != op->crc.csum_type) {
+			ret = bch2_write_rechecksum(c, op, op->csum_type);
+			if (ret)
+				return ret;
+		}
 
-		return PREP_ENCODED_DO_WRITE;
+		return 1;
 	}
 
 	/*
@@ -835,20 +911,24 @@
 	 * is, we have to decompress it:
 	 */
 	if (crc_is_compressed(op->crc)) {
-		struct bch_csum csum;
-
-		if (bch2_write_decrypt(op))
-			return PREP_ENCODED_CHECKSUM_ERR;
-
 		/* Last point we can still verify checksum: */
-		csum = bch2_checksum_bio(c, op->crc.csum_type,
-					 extent_nonce(op->version, op->crc),
-					 bio);
+		struct nonce nonce = extent_nonce(op->version, op->crc);
+		csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
 		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
-			return PREP_ENCODED_CHECKSUM_ERR;
+			goto csum_err;
+
+		if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+			ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+			if (ret)
+				return ret;
+
+			op->crc.csum_type = 0;
+			op->crc.csum = (struct bch_csum) { 0, 0 };
+		}
 
-		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
-			return PREP_ENCODED_ERR;
+		ret = bch2_bio_uncompress_inplace(op, bio);
+		if (ret)
+			return ret;
 	}
 
 	/*
@@ -860,22 +940,44 @@
 	 * If the data is checksummed and we're only writing a subset,
 	 * rechecksum and adjust bio to point to currently live data:
 	 */
-	if ((op->crc.live_size != op->crc.uncompressed_size ||
-	     op->crc.csum_type != op->csum_type) &&
-	    bch2_write_rechecksum(c, op, op->csum_type) &&
-	    !c->opts.no_data_io)
-		return PREP_ENCODED_CHECKSUM_ERR;
+	if (op->crc.live_size != op->crc.uncompressed_size ||
+	    op->crc.csum_type != op->csum_type) {
+		ret = bch2_write_rechecksum(c, op, op->csum_type);
+		if (ret)
+			return ret;
+	}
 
 	/*
 	 * If we want to compress the data, it has to be decrypted:
 	 */
-	if ((op->compression_opt ||
-	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
-	     bch2_csum_type_is_encryption(op->csum_type)) &&
-	    bch2_write_decrypt(op))
-		return PREP_ENCODED_CHECKSUM_ERR;
+	if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+	    (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+		struct nonce nonce = extent_nonce(op->version, op->crc);
+		csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+			goto csum_err;
+
+		ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+		if (ret)
+			return ret;
+
+		op->crc.csum_type = 0;
+		op->crc.csum = (struct bch_csum) { 0, 0 };
+	}
 
-	return PREP_ENCODED_OK;
+	return 0;
+csum_err:
+	bch2_write_op_error(op, op->pos.offset,
+		"error verifying existing checksum while moving existing data (memory corruption?)\n"
+		"  expected %0llx:%0llx got %0llx:%0llx type %s",
+		op->crc.csum.hi,
+		op->crc.csum.lo,
+		csum.hi,
+		csum.lo,
+		op->crc.csum_type < BCH_CSUM_NR
+		? __bch2_csum_types[op->crc.csum_type]
+		: "(unknown)");
+	return bch_err_throw(c, data_write_csum);
 }
 
 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -890,43 +992,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 	bool page_alloc_failed = false;
 	int ret, more = 0;
 
+	if (op->incompressible)
+		op->compression_opt = 0;
+
 	BUG_ON(!bio_sectors(src));
 
 	ec_buf = bch2_writepoint_ec_buf(c, wp);
 
-	switch (bch2_write_prep_encoded_data(op, wp)) {
-	case PREP_ENCODED_OK:
-		break;
-	case PREP_ENCODED_ERR:
-		ret = -EIO;
-		goto err;
-	case PREP_ENCODED_CHECKSUM_ERR:
-		goto csum_err;
-	case PREP_ENCODED_DO_WRITE:
-		/* XXX look for bug here */
-		if (ec_buf) {
-			dst = bch2_write_bio_alloc(c, wp, src,
-						   &page_alloc_failed,
-						   ec_buf);
-			bio_copy_data(dst, src);
-			bounce = true;
+	if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+		ret = bch2_write_prep_encoded_data(op, wp);
+		if (ret < 0)
+			goto err;
+		if (ret) {
+			if (ec_buf) {
+				dst = bch2_write_bio_alloc(c, wp, src,
+							   &page_alloc_failed,
+							   ec_buf);
+				bio_copy_data(dst, src);
+				bounce = true;
+			}
+			init_append_extent(op, wp, op->version, op->crc);
+			goto do_write;
 		}
-		init_append_extent(op, wp, op->version, op->crc);
-		goto do_write;
 	}
 
 	if (ec_buf ||
 	    op->compression_opt ||
 	    (op->csum_type &&
-	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+	     !(op->flags & BCH_WRITE_pages_stable)) ||
 	    (bch2_csum_type_is_encryption(op->csum_type) &&
-	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+	     !(op->flags & BCH_WRITE_pages_owned))) {
 		dst = bch2_write_bio_alloc(c, wp, src,
 					   &page_alloc_failed,
 					   ec_buf);
 		bounce = true;
 	}
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+	if (!bounce && write_corrupt_ratio) {
+		dst = bch2_write_bio_alloc(c, wp, src,
+					   &page_alloc_failed,
+					   ec_buf);
+		bounce = true;
+	}
+#endif
+
 	saved_iter = dst->bi_iter;
 
 	do {
@@ -940,7 +1050,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 			break;
 
 		BUG_ON(op->compression_opt &&
-		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
+		       (op->flags & BCH_WRITE_data_encoded) &&
 		       bch2_csum_type_is_encryption(op->crc.csum_type));
 		BUG_ON(op->compression_opt && !bounce);
 
@@ -978,7 +1088,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 			}
 		}
 
-		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+		if ((op->flags & BCH_WRITE_data_encoded) &&
 		    !crc_is_compressed(crc) &&
 		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
 		    bch2_csum_type_is_encryption(op->csum_type)) {
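Note: with the new write_corrupt_ratio module parameter (CONFIG_BCACHEFS_DEBUG only), roughly one in every 'ratio' extents is deliberately corrupted after checksumming, to exercise the read-side error paths. bch2_maybe_corrupt_bio()'s implementation isn't shown in this diff, so the bit-flip below is illustrative only:

	#include <stdint.h>
	#include <stdlib.h>

	/* corrupt roughly one out of every 'ratio' buffers by flipping one bit */
	static void maybe_corrupt_buf(uint8_t *buf, size_t len, unsigned ratio)
	{
		if (!ratio || !len || (unsigned) (rand() % ratio))
			return;

		size_t byte = (size_t) rand() % len;
		buf[byte] ^= 1u << (rand() % 8);
	}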
@@ -996,12 +1106,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 			 * data can't be modified (by userspace) while it's in
 			 * flight.
 			 */
-			if (bch2_rechecksum_bio(c, src, version, op->crc,
+			ret = bch2_rechecksum_bio(c, src, version, op->crc,
 					&crc, &op->crc,
 					src_len >> 9,
 					bio_sectors(src) - (src_len >> 9),
-					op->csum_type))
-				goto csum_err;
+					op->csum_type);
+			if (ret)
+				goto err;
 			/*
 			 * rchecksum_bio sets compression_type on crc from op->crc,
 			 * this isn't always correct as sometimes we're changing
 			 *
@@ -1010,13 +1121,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 			crc.compression_type = compression_type;
 			crc.nonce = nonce;
 		} else {
-			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
-			    bch2_rechecksum_bio(c, src, version, op->crc,
+			if ((op->flags & BCH_WRITE_data_encoded) &&
+			    (ret = bch2_rechecksum_bio(c, src, version, op->crc,
 					NULL, &op->crc,
 					src_len >> 9,
 					bio_sectors(src) - (src_len >> 9),
-					op->crc.csum_type))
-				goto csum_err;
+					op->crc.csum_type)))
+				goto err;
 
 			crc.compressed_size	= dst_len >> 9;
 			crc.uncompressed_size	= src_len >> 9;
@@ -1036,6 +1147,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 
 		init_append_extent(op, wp, version, crc);
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+		if (write_corrupt_ratio) {
+			swap(dst->bi_iter.bi_size, dst_len);
+			bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+			swap(dst->bi_iter.bi_size, dst_len);
+		}
+#endif
+
 		if (dst != src)
 			bio_advance(dst, dst_len);
 		bio_advance(src, src_len);
@@ -1067,10 +1186,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 do_write:
 	*_dst = dst;
 	return more;
-csum_err:
-	bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
-		op->flags & BCH_WRITE_MOVE ? "move" : "user");
-	ret = -EIO;
 err:
 	if (to_wbio(dst)->bounce)
 		bch2_bio_free_pages_pool(c, dst);
@@ -1093,6 +1208,8 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
 		return false;
 
 	e = bkey_s_c_to_extent(k);
+
+	guard(rcu)();
 	extent_for_each_ptr_decode(e, p, entry) {
 		if (crc_is_encoded(p.crc) || p.has_ec)
 			return false;
@@ -1103,20 +1220,6 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
 	return replicas >= op->opts.data_replicas;
 }
 
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
-	struct bch_fs *c = op->c;
-
-	for_each_keylist_key(&op->insert_keys, k) {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
-		bkey_for_each_ptr(ptrs, ptr)
-			bch2_bucket_nocow_unlock(&c->nocow_locks,
-						 PTR_BUCKET_POS(c, ptr),
-						 BUCKET_NOCOW_LOCK_UPDATE);
-	}
-}
-
 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 						  struct btree_iter *iter,
 						  struct bkey_i *orig,
@@ -1150,48 +1253,43 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 	return  bch2_extent_update_i_size_sectors(trans, iter,
 			min(new->k.p.offset << 9, new_i_size), 0) ?:
 		bch2_trans_update(trans, iter, new,
-				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+				  BTREE_UPDATE_internal_snapshot_node);
 }
 
 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
 	struct btree_trans *trans = bch2_trans_get(c);
+	int ret = 0;
 
 	for_each_keylist_key(&op->insert_keys, orig) {
-		int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+		ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
 				     bkey_start_pos(&orig->k), orig->k.p,
-				     BTREE_ITER_INTENT, k,
+				     BTREE_ITER_intent, k,
 				     NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
 			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
 		}));
-
-		if (ret && !bch2_err_matches(ret, EROFS)) {
-			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
-			bch_err_inum_offset_ratelimited(c,
-				insert->k.p.inode, insert->k.p.offset << 9,
-				"%s write error while doing btree update: %s",
-				op->flags & BCH_WRITE_MOVE ? "move" : "user",
-				bch2_err_str(ret));
-		}
-
-		if (ret) {
-			op->error = ret;
+		if (ret)
 			break;
-		}
 	}
 
 	bch2_trans_put(trans);
+
+	if (ret && !bch2_err_matches(ret, EROFS)) {
+		struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+		bch2_write_op_error(op, bkey_start_offset(&insert->k),
+				    "btree update error: %s", bch2_err_str(ret));
+	}
+
+	if (ret)
+		op->error = ret;
 }
 
 static void __bch2_nocow_write_done(struct bch_write_op *op)
 {
-	bch2_nocow_write_unlock(op);
-
-	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
-		op->error = -EIO;
-	} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+	if (unlikely(op->flags & BCH_WRITE_io_error)) {
+		op->error = bch_err_throw(op->c, data_write_io);
+	} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
 		bch2_nocow_write_convert_unwritten(op);
 }
 
@@ -1218,9 +1316,9 @@ static void bch2_nocow_write(struct bch_write_op *op)
 	DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
 	u32 snapshot;
 	struct bucket_to_lock *stale_at;
-	int ret;
+	int stale, ret;
 
-	if (op->flags & BCH_WRITE_MOVE)
+	if (op->flags & BCH_WRITE_move)
 		return;
 
 	darray_init(&buckets);
@@ -1234,13 +1332,17 @@ retry:
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(op->pos.inode, op->pos.offset, snapshot),
-			     BTREE_ITER_SLOTS);
+			     BTREE_ITER_slots);
 	while (1) {
 		struct bio *bio = &op->wbio.bio;
 
 		buckets.nr = 0;
 
-		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			break;
+
+		k = bch2_btree_iter_peek_slot(trans, &iter);
 		ret = bkey_err(k);
 		if (ret)
 			break;
@@ -1259,21 +1361,23 @@ retry:
 		/* Get iorefs before dropping btree locks: */
 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		bkey_for_each_ptr(ptrs, ptr) {
-			struct bpos b = PTR_BUCKET_POS(c, ptr);
+			struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+							BCH_DEV_WRITE_REF_io_write);
+			if (unlikely(!ca))
+				goto err_get_ioref;
+
+			struct bpos b = PTR_BUCKET_POS(ca, ptr);
 			struct nocow_lock_bucket *l =
 				bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
 			prefetch(l);
 
-			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
-				goto err_get_ioref;
-
 			/* XXX allocating memory with btree locks held - rare */
 			darray_push_gfp(&buckets, ((struct bucket_to_lock) {
 						   .b = b, .gen = ptr->gen, .l = l,
 						   }), GFP_KERNEL|__GFP_NOFAIL);
 
 			if (ptr->unwritten)
-				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+				op->flags |= BCH_WRITE_convert_unwritten;
 		}
 
 		/* Unlock before taking nocow locks, doing IO: */
@@ -1281,20 +1385,18 @@ retry:
 		bch2_trans_unlock(trans);
 
 		bch2_cut_front(op->pos, op->insert_keys.top);
-		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+		if (op->flags & BCH_WRITE_convert_unwritten)
 			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
 
 		darray_for_each(buckets, i) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+			struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
 
 			__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
 						 bucket_to_u64(i->b),
 						 BUCKET_NOCOW_LOCK_UPDATE);
 
-			rcu_read_lock();
-			bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
-			rcu_read_unlock();
-
+			int gen = bucket_gen_get(ca, i->b.offset);
+			stale = gen < 0 ? gen : gen_after(gen, i->gen);
 			if (unlikely(stale)) {
 				stale_at = i;
 				goto err_bucket_stale;
 			}
 		}
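Note: the staleness check above now uses bucket_gen_get(), which can return a negative error, hence 'stale = gen < 0 ? gen : gen_after(gen, i->gen)'. Bucket generations are small wrapping counters, so "newer than" must be a signed-distance comparison rather than a plain '>'. A sketch, assumed to match the semantics of bcachefs's gen_after() helper:

	#include <stdint.h>

	static int gen_after(uint8_t a, uint8_t b)
	{
		int8_t d = (int8_t) (a - b);	/* wraparound-safe distance */
		return d > 0;
	}

	/* e.g. gen_after(1, 255) is true: 1 is two increments past 255 */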
@@ -1308,7 +1410,7 @@ retry:
 			wbio_init(bio)->put_bio = true;
 			bio->bi_opf = op->wbio.bio.bi_opf;
 		} else {
-			op->flags |= BCH_WRITE_DONE;
+			op->flags |= BCH_WRITE_submitted;
 		}
 
 		op->pos.offset += bio_sectors(bio);
@@ -1318,13 +1420,14 @@ retry:
 		bio->bi_private	= &op->cl;
 		bio->bi_opf |= REQ_OP_WRITE;
 		closure_get(&op->cl);
+
 		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  op->insert_keys.top, true);
 
 		bch2_keylist_push(&op->insert_keys);
-		if (op->flags & BCH_WRITE_DONE)
+		if (op->flags & BCH_WRITE_submitted)
 			break;
-		bch2_btree_iter_advance(&iter);
+		bch2_btree_iter_advance(trans, &iter);
 	}
out:
 	bch2_trans_iter_exit(trans, &iter);
@@ -1332,23 +1435,22 @@ err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
+	bch2_trans_put(trans);
+	darray_exit(&buckets);
+
 	if (ret) {
-		bch_err_inum_offset_ratelimited(c,
-			op->pos.inode, op->pos.offset << 9,
-			"%s: btree lookup error %s", __func__, bch2_err_str(ret));
+		bch2_write_op_error(op, op->pos.offset,
+				    "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
 		op->error = ret;
-		op->flags |= BCH_WRITE_DONE;
+		op->flags |= BCH_WRITE_submitted;
 	}
 
-	bch2_trans_put(trans);
-	darray_exit(&buckets);
-
 	/* fallback to cow write path? */
-	if (!(op->flags & BCH_WRITE_DONE)) {
+	if (!(op->flags & BCH_WRITE_submitted)) {
 		closure_sync(&op->cl);
 		__bch2_nocow_write_done(op);
 		op->insert_keys.top = op->insert_keys.keys;
-	} else if (op->flags & BCH_WRITE_SYNC) {
+	} else if (op->flags & BCH_WRITE_sync) {
 		closure_sync(&op->cl);
 		bch2_nocow_write_done(&op->cl.work);
 	} else {
@@ -1362,7 +1464,8 @@ err:
 	return;
err_get_ioref:
 	darray_for_each(buckets, i)
-		percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+		enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
+				   BCH_DEV_WRITE_REF_io_write);
 
 	/* Fall back to COW path: */
 	goto out;
@@ -1373,8 +1476,18 @@ err_bucket_stale:
 			break;
 	}
 
-	/* We can retry this: */
-	ret = -BCH_ERR_transaction_restart;
+	struct printbuf buf = PRINTBUF;
+	if (bch2_fs_inconsistent_on(stale < 0, c,
+				    "pointer to invalid bucket in nocow path on device %llu\n  %s",
+				    stale_at->b.inode,
+				    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = bch_err_throw(c, data_write_invalid_ptr);
+	} else {
+		/* We can retry this: */
+		ret = bch_err_throw(c, transaction_restart);
+	}
+	printbuf_exit(&buf);
+
 	goto err_get_ioref;
 }
 
@@ -1390,7 +1503,7 @@ static void __bch2_write(struct bch_write_op *op)
 
 	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
 		bch2_nocow_write(op);
-		if (op->flags & BCH_WRITE_DONE)
+		if (op->flags & BCH_WRITE_submitted)
 			goto out_nofs_restore;
 	}
again:
@@ -1417,19 +1530,17 @@ again:
 		 * freeing up space on specific disks, which means that
 		 * allocations for specific disks may hang arbitrarily long:
 		 */
-		ret = bch2_trans_do(c, NULL, NULL, 0,
+		ret = bch2_trans_run(c, lockrestart_do(trans,
 			bch2_alloc_sectors_start_trans(trans,
 				op->target,
-				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+				op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
 				op->write_point,
 				&op->devs_have,
 				op->nr_replicas,
 				op->nr_replicas_required,
 				op->watermark,
 				op->flags,
-				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
-					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
-				? NULL : &op->cl, &wp));
+				&op->cl, &wp)));
 		if (unlikely(ret)) {
 			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
 				break;
@@ -1445,16 +1556,12 @@ again:
 		bch2_alloc_sectors_done_inlined(c, wp);
err:
 		if (ret <= 0) {
-			op->flags |= BCH_WRITE_DONE;
-
-			if (ret < 0) {
-				if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
-					bch_err_inum_offset_ratelimited(c,
-						op->pos.inode,
-						op->pos.offset << 9,
-						"%s(): %s error: %s", __func__,
-						op->flags & BCH_WRITE_MOVE ? "move" : "user",
-						bch2_err_str(ret));
+			op->flags |= BCH_WRITE_submitted;
+
+			if (unlikely(ret < 0)) {
+				if (!(op->flags & BCH_WRITE_alloc_nowait))
+					bch2_write_op_error(op, op->pos.offset,
+							    "%s(): %s", __func__, bch2_err_str(ret));
 				op->error = ret;
 				break;
 			}
@@ -1480,13 +1587,14 @@ err:
 	 * synchronously here if we weren't able to submit all of the IO at
 	 * once, as that signals backpressure to the caller.
 	 */
-	if ((op->flags & BCH_WRITE_SYNC) ||
-	    (!(op->flags & BCH_WRITE_DONE) &&
-	     !(op->flags & BCH_WRITE_IN_WORKER))) {
-		closure_sync(&op->cl);
+	if ((op->flags & BCH_WRITE_sync) ||
+	    (!(op->flags & BCH_WRITE_submitted) &&
+	     !(op->flags & BCH_WRITE_in_worker))) {
+		bch2_wait_on_allocator(c, &op->cl);
+
 		__bch2_write_index(op);
 
-		if (!(op->flags & BCH_WRITE_DONE))
+		if (!(op->flags & BCH_WRITE_submitted))
 			goto again;
 		bch2_write_done(&op->cl);
 	} else {
@@ -1505,8 +1613,10 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 	unsigned sectors;
 	int ret;
 
-	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-	op->flags |= BCH_WRITE_DONE;
+	memset(&op->failed, 0, sizeof(op->failed));
+
+	op->flags |= BCH_WRITE_wrote_data_inline;
+	op->flags |= BCH_WRITE_submitted;
 
 	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
 
@@ -1523,7 +1633,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 	id = bkey_inline_data_init(op->insert_keys.top);
 	id->k.p		= op->pos;
-	id->k.version	= op->version;
+	id->k.bversion	= op->version;
 	id->k.size	= sectors;
 
 	iter = bio->bi_iter;
@@ -1569,33 +1679,35 @@ CLOSURE_CALLBACK(bch2_write)
 	BUG_ON(!op->write_point.v);
 	BUG_ON(bkey_eq(op->pos, POS_MAX));
 
+	async_object_list_add(c, write_op, op, &op->list_idx);
+
+	if (op->flags & BCH_WRITE_only_specified_devs)
+		op->flags |= BCH_WRITE_alloc_nowait;
+
 	op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
 	op->start_time = local_clock();
 	bch2_keylist_init(&op->insert_keys, op->inline_keys);
 	wbio_init(bio)->put_bio = false;
 
-	if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
-		bch_err_inum_offset_ratelimited(c,
-			op->pos.inode,
-			op->pos.offset << 9,
-			"%s write error: misaligned write",
-			op->flags & BCH_WRITE_MOVE ? "move" : "user");
-		op->error = -EIO;
+	if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
+		bch2_write_op_error(op, op->pos.offset, "misaligned write");
+		op->error = bch_err_throw(c, data_write_misaligned);
 		goto err;
 	}
 
 	if (c->opts.nochanges) {
-		op->error = -BCH_ERR_erofs_no_writes;
+		op->error = bch_err_throw(c, erofs_no_writes);
 		goto err;
 	}
 
-	if (!(op->flags & BCH_WRITE_MOVE) &&
-	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
-		op->error = -BCH_ERR_erofs_no_writes;
+	if (!(op->flags & BCH_WRITE_move) &&
+	    !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
+		op->error = bch_err_throw(c, erofs_no_writes);
 		goto err;
 	}
 
-	this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+	if (!(op->flags & BCH_WRITE_move))
+		this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
 	bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
 	data_len = min_t(u64, bio->bi_iter.bi_size,
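Note: the misaligned-write check above, 'bi_size & (block_size - 1)', is the usual power-of-two modulo trick: for power-of-two block sizes it equals 'bi_size % block_size' but avoids a division. A tiny illustration:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	static bool is_block_aligned(uint64_t size, uint32_t block_size)
	{
		/* only valid for power-of-two block sizes */
		assert(block_size && !(block_size & (block_size - 1)));
		return !(size & (block_size - 1));
	}

	/* e.g. is_block_aligned(4096, 512) is true; is_block_aligned(4097, 512) is false */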
"move" : "user"); - op->error = -EIO; + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + bch2_write_op_error(op, op->pos.offset, "misaligned write"); + op->error = bch_err_throw(c, data_write_misaligned); goto err; } if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; + if (!(op->flags & BCH_WRITE_move) && + !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { + op->error = bch_err_throw(c, erofs_no_writes); goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_move)) + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1613,6 +1725,7 @@ err: bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1626,43 +1739,42 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_write_init; - - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) + return bch_err_throw(c, ENOMEM_bio_write_init); return 0; } |