Diffstat (limited to 'fs/bcachefs/io_write.c')
 fs/bcachefs/io_write.c | 690 ++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 401 insertions(+), 289 deletions(-)
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f137252bccc5..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
@@ -15,6 +16,7 @@
#include "compress.h"
#include "debug.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
@@ -34,6 +36,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "Corrupt 1 in N written bios, for testing (0 = off)");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -69,11 +77,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
u64 io_latency = time_after64(now, submit_time)
? now - submit_time
: 0;
- u64 old, new, v = atomic64_read(latency);
+ u64 old, new;
+ old = atomic64_read(latency);
do {
- old = v;
-
/*
* If the io latency was reasonably close to the current
* latency, skip doing the update and atomic operation - most of
@@ -84,7 +91,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
break;
new = ewma_add(old, io_latency, 5);
- } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+ } while (!atomic64_try_cmpxchg(latency, &old, new));
bch2_congested_acct(ca, io_latency, now, rw);
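
The hunk above converts the open-coded cmpxchg loop to atomic64_try_cmpxchg(), which on failure reloads 'old' with the value it found, so the explicit re-read at the top of the loop disappears. A minimal userspace sketch of the same lock-free EWMA update, using C11 atomics in place of the kernel primitive (the ewma_add() formulation here is one common variant, not necessarily bcachefs's exact rounding):

#include <stdatomic.h>
#include <stdint.h>

/* One common EWMA formulation: new = old + (val - old) / 2^weight */
static inline uint64_t ewma_add(uint64_t old, uint64_t val, unsigned weight)
{
	return old + (uint64_t)((int64_t)(val - old) >> weight);
}

static void latency_acct(_Atomic uint64_t *latency, uint64_t io_latency)
{
	uint64_t old = atomic_load_explicit(latency, memory_order_relaxed);

	/*
	 * Like atomic64_try_cmpxchg(), compare_exchange_weak updates 'old'
	 * with the current value when it fails, so no separate re-read is
	 * needed inside the loop.
	 */
	while (!atomic_compare_exchange_weak(latency, &old,
					     ewma_add(old, io_latency, 5)))
		;
}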
@@ -163,10 +170,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- bch2_trans_copy_iter(&iter, extent_iter);
+ bch2_trans_copy_iter(trans, &iter, extent_iter);
- for_each_btree_key_upto_continue_norestart(iter,
- new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_max_continue_norestart(trans, iter,
+ new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
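
Each overwritten key above is charged for the sectors it shares with the new extent: the overlap of two ranges is min(end) - max(start). A standalone restatement (names mine; in the real loop the iterator only visits keys that overlap the new extent, so the result is never negative):

#include <stdint.h>

/* Overlap, in sectors, of two half-open ranges [start, end). */
static int64_t overlap_sectors(uint64_t a_start, uint64_t a_end,
			       uint64_t b_start, uint64_t b_end)
{
	uint64_t start = a_start > b_start ? a_start : b_start;
	uint64_t end   = a_end   < b_end   ? a_end   : b_end;

	return end > start ? (int64_t)(end - start) : 0;
}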
@@ -199,9 +206,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
u64 new_i_size,
s64 i_sectors_delta)
{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
@@ -213,26 +217,38 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
- unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
+ unsigned inode_update_flags = BTREE_UPDATE_nojournal;
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
- BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
+ BTREE_ITER_intent|
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ret;
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
if (unlikely(ret))
goto err;
}
- inode = bkey_i_to_inode_v3(k);
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -241,17 +257,42 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
if (i_sectors_delta) {
+ s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
+ if (unlikely(bi_sectors + i_sectors_delta < 0)) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
+ extent_iter->pos.inode, bi_sectors, i_sectors_delta);
+
+ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ if (i_sectors_delta < 0)
+ i_sectors_delta = -bi_sectors;
+ else
+ i_sectors_delta = 0;
+ }
+
le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
inode_update_flags = 0;
}
+ /*
+ * extents, dirents and xattrs updates require that an inode update also
+ * happens - to ensure that if a key exists in one of those btrees with
+ * a given snapshot ID an inode is also present - so we may have to skip
+ * the nojournal optimization:
+ */
if (inode->k.p.snapshot != iter.snapshot) {
inode->k.p.snapshot = iter.snapshot;
inode_update_flags = 0;
}
ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
inode_update_flags);
err:
bch2_trans_iter_exit(trans, &iter);
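
The new underflow handling above logs the inconsistency through the fsck-error machinery, then clamps the delta so bi_sectors bottoms out at zero instead of wrapping. The clamping rule, distilled (helper name mine):

#include <stdint.h>

/* Clamp a sector delta so 'bi_sectors + delta' can never go negative. */
static int64_t clamp_sectors_delta(int64_t bi_sectors, int64_t delta)
{
	if (bi_sectors + delta >= 0)
		return delta;		/* no underflow, use as-is */

	/* Underflow: pin the counter at zero rather than wrapping. */
	return delta < 0 ? -bi_sectors : 0;
}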
@@ -278,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans,
* path already traversed at iter->pos because
* bch2_trans_extent_update() will use it to attempt extent merging
*/
- ret = __bch2_btree_iter_traverse(iter);
+ ret = __bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -323,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
+ bch2_btree_iter_set_pos(trans, iter, next_pos);
return 0;
}
@@ -360,13 +401,12 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
bkey_start_pos(&sk.k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
- bch2_extent_update(trans, inum, &iter, sk.k,
+ ret = bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ op->flags & BCH_WRITE_check_enospc);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -388,6 +428,38 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
+{
+ struct printbuf buf = PRINTBUF;
+
+ if (op->subvol) {
+ bch2_inum_offset_err_msg(op->c, &buf,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ } else {
+ struct bpos pos = op->pos;
+ pos.offset = offset;
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
+ }
+
+ prt_str(&buf, "write error: ");
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ if (op->flags & BCH_WRITE_move) {
+ struct data_update *u = container_of(op, struct data_update, op);
+
+ prt_printf(&buf, "\n from internal move ");
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
+ }
+
+ bch_err_ratelimited(op->c, "%s", buf.buf);
+ printbuf_exit(&buf);
+}
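
bch2_write_op_error() centralizes what used to be several open-coded bch_err_inum_offset_ratelimited() calls: it prefixes the inum and offset (the caller's offset is in sectors, hence the << 9 conversion to bytes), then forwards the caller's format string via va_list. A userspace sketch of that varargs-forwarding shape (names mine):

#include <stdarg.h>
#include <stdio.h>

/* Prefix the failing byte offset, then append the caller's message. */
static void write_op_error(unsigned long long offset_bytes, const char *fmt, ...)
{
	char buf[256];
	int n = snprintf(buf, sizeof(buf), "at byte %llu: write error: ",
			 offset_bytes);
	if (n < 0 || n >= (int)sizeof(buf))
		n = sizeof(buf) - 1;

	va_list args;
	va_start(args, fmt);
	vsnprintf(buf + n, sizeof(buf) - n, fmt, args);
	va_end(args);

	fprintf(stderr, "%s\n", buf);
}

/* e.g. write_op_error(sector << 9, "btree update error: %s", err_str); */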
+
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k,
@@ -395,17 +467,29 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
struct bch_write_bio *n;
+ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
+ unsigned ref_idx = type == BCH_DATA_btree
+ ? BCH_DEV_READ_REF_btree_node_write
+ : BCH_DEV_WRITE_REF_io_write;
BUG_ON(c->opts.nochanges);
- bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(!bch2_dev_exists2(c, ptr->dev));
+ const struct bch_extent_ptr *last = NULL;
+ bkey_for_each_ptr(ptrs, ptr)
+ last = ptr;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ bkey_for_each_ptr(ptrs, ptr) {
+ /*
+ * XXX: btree writes should be using io_ref[WRITE], but we
+ * aren't retrying failed btree writes yet (due to device
+ * removal/ro):
+ */
+ struct bch_dev *ca = nocow
+ ? bch2_dev_have_ref(c, ptr->dev)
+ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
- if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOFS, &ca->replica_set));
+ if (ptr != last) {
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
@@ -422,11 +506,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = nocow || bch2_dev_get_ioref(ca,
- type == BCH_DATA_btree ? READ : WRITE);
+ n->have_ioref = ca != NULL;
n->nocow = nocow;
n->submit_time = local_clock();
n->inode_offset = bkey_start_offset(&k->k);
+ if (nocow)
+ n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
n->bio.bi_iter.bi_sector = ptr->offset;
if (likely(n->have_ioref)) {
@@ -460,20 +545,21 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
- if (!(op->flags & BCH_WRITE_MOVE))
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ if (!(op->flags & BCH_WRITE_move))
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
+ struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
@@ -484,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
+ return bch_err_throw(c, data_write_io);
}
if (dst != src)
@@ -507,7 +593,7 @@ static void __bch2_write_index(struct bch_write_op *op)
unsigned dev;
int ret = 0;
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
ret = bch2_write_drop_io_error_ptrs(op);
if (ret)
goto err;
@@ -516,7 +602,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
- ret = !(op->flags & BCH_WRITE_MOVE)
+ ret = !(op->flags & BCH_WRITE_move)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);
@@ -525,14 +611,11 @@ static void __bch2_write_index(struct bch_write_op *op)
op->written += sectors_start - keylist_sectors(keys);
- if (ret && !bch2_err_matches(ret, EROFS)) {
+ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "%s write error while doing btree update: %s",
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
}
if (ret)
@@ -541,21 +624,29 @@ static void __bch2_write_index(struct bch_write_op *op)
out:
/* If a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_submitted;
goto out;
}
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
if (state != wp->state) {
+ struct task_struct *p = current;
u64 now = ktime_get_ns();
+ u64 runtime = p->se.sum_exec_runtime +
+ (now - p->se.exec_start);
+
+ if (state == WRITE_POINT_runnable)
+ wp->last_runtime = runtime;
+ else if (wp->state == WRITE_POINT_runnable)
+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
if (wp->last_state_change &&
time_after64(now, wp->last_state_change))
@@ -569,7 +660,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
{
enum write_point_state state;
- state = running ? WRITE_POINT_running :
+ state = running ? WRITE_POINT_runnable :
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
: WRITE_POINT_stopped;
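
The two hunks above split the old "running" state in two: a write point becomes "runnable" when a worker picks it up, and only the task's execution-time delta (from sum_exec_runtime) is charged as "running", so time spent scheduled out is not misattributed. The bookkeeping, distilled into a standalone sketch (types and names mine):

#include <stdint.h>

enum wp_state { WP_RUNNABLE, WP_RUNNING, WP_WAITING_IO, WP_STOPPED };

struct wp_stats {
	enum wp_state	state;
	uint64_t	last_runtime;	/* task runtime on entering runnable */
	uint64_t	time[4];	/* ns accumulated per state */
};

/* task_runtime: the current task's accumulated CPU time, in ns. */
static void wp_set_state(struct wp_stats *wp, enum wp_state new,
			 uint64_t task_runtime)
{
	if (new == wp->state)
		return;

	if (new == WP_RUNNABLE)
		wp->last_runtime = task_runtime;
	else if (wp->state == WP_RUNNABLE)
		/* Charge only CPU time actually executed as "running" */
		wp->time[WP_RUNNING] += task_runtime - wp->last_runtime;

	wp->state = new;
}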
@@ -583,8 +674,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
- if ((op->flags & BCH_WRITE_DONE) &&
- (op->flags & BCH_WRITE_MOVE))
+ if ((op->flags & BCH_WRITE_submitted) &&
+ (op->flags & BCH_WRITE_move))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
spin_lock_irqsave(&wp->writes_lock, flags);
@@ -615,20 +706,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
while (1) {
spin_lock_irq(&wp->writes_lock);
- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
- if (op)
- list_del(&op->wp_list);
+ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
wp_update_state(wp, op != NULL);
spin_unlock_irq(&wp->writes_lock);
if (!op)
break;
- op->flags |= BCH_WRITE_IN_WORKER;
+ op->flags |= BCH_WRITE_in_worker;
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_submitted))
__bch2_write(op);
else
bch2_write_done(&op->cl);
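
list_pop_entry() folds the list_first_entry_or_null() + list_del() pair of the old code into one call. Roughly what such a helper has to do, sketched over the kernel's list API (the real macro may differ in detail):

#include <linux/list.h>

#define list_pop_entry(head, type, member)				\
({									\
	type *_ret = list_first_entry_or_null(head, type, member);	\
	if (_ret)							\
		list_del_init(&_ret->member);				\
	_ret;								\
})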
@@ -642,25 +731,41 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
-
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ struct bch_dev *ca = wbio->have_ioref
+ ? bch2_dev_have_ref(c, wbio->dev)
+ : NULL;
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (unlikely(bio->bi_status)) {
+ if (ca)
+ bch_err_inum_offset_ratelimited(ca,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ else
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_IO_ERROR;
+ op->flags |= BCH_WRITE_io_error;
}
- if (wbio->nocow)
+ if (wbio->nocow) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ POS(ca->dev_idx, wbio->nocow_bucket),
+ BUCKET_NOCOW_LOCK_UPDATE);
set_bit(wbio->dev, op->devs_need_flush->d);
-
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
- percpu_ref_put(&ca->io_ref);
}
+ if (wbio->have_ioref)
+ enumerated_ref_put(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
+
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -685,7 +790,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
- e->k.version = version;
+ e->k.bversion = version;
if (crc.csum_type ||
crc.compression_type ||
@@ -693,7 +798,10 @@ static void init_append_extent(struct bch_write_op *op,
bch2_extent_crc_append(&e->k_i, crc);
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_CACHED);
+ op->flags & BCH_WRITE_cached);
+
+ if (!(op->flags & BCH_WRITE_move))
+ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
bch2_keylist_push(&op->insert_keys);
}
@@ -753,7 +861,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
{
struct bio *bio = &op->wbio.bio;
struct bch_extent_crc_unpacked new_crc;
- int ret;
/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
@@ -761,10 +868,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
bch2_csum_type_is_encryption(new_csum_type))
new_csum_type = op->crc.csum_type;
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
if (ret)
return ret;
@@ -774,44 +881,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
return 0;
}
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
+ struct bch_csum csum;
+ int ret = 0;
BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
@@ -822,12 +897,13 @@ static enum prep_encoded_ret {
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ op->csum_type != op->crc.csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
- return PREP_ENCODED_DO_WRITE;
+ return 1;
}
/*
@@ -835,20 +911,24 @@ static enum prep_encoded_ret {
* is, we have to decompress it:
*/
if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
/* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ goto csum_err;
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
- if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
- return PREP_ENCODED_ERR;
+ ret = bch2_bio_uncompress_inplace(op, bio);
+ if (ret)
+ return ret;
}
/*
@@ -860,22 +940,44 @@ static enum prep_encoded_ret {
* If the data is checksummed and we're only writing a subset,
* rechecksum and adjust bio to point to currently live data:
*/
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
- return PREP_ENCODED_OK;
+ return 0;
+csum_err:
+ bch2_write_op_error(op, op->pos.offset,
+ "error verifying existing checksum while moving existing data (memory corruption?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx type %s",
+ op->crc.csum.hi,
+ op->crc.csum.lo,
+ csum.hi,
+ csum.lo,
+ op->crc.csum_type < BCH_CSUM_NR
+ ? __bch2_csum_types[op->crc.csum_type]
+ : "(unknown)");
+ return bch_err_throw(c, data_write_csum);
}
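
The rewrite above inlines the old bch2_write_decrypt(): in both places the existing checksum is verified before decrypting, because with an authenticated checksum (poly1305) the stored MAC can no longer be checked once the data changes, and ChaCha20 being an XOR stream cipher, bch2_encrypt_bio() serves as its own inverse. The ordering, reduced to a toy standalone example (the hash and cipher here are deliberately trivial stand-ins for the real primitives):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct csum { uint64_t hi, lo; };

static struct csum toy_checksum(const uint8_t *buf, size_t len)
{
	uint64_t h = 0xcbf29ce484222325ULL;	/* FNV-1a, not poly1305 */
	for (size_t i = 0; i < len; i++)
		h = (h ^ buf[i]) * 0x100000001b3ULL;
	return (struct csum) { .hi = 0, .lo = h };
}

static void toy_cipher(uint8_t *buf, size_t len)	/* encrypt == decrypt */
{
	for (size_t i = 0; i < len; i++)
		buf[i] ^= 0x5a;
}

/*
 * Verify the stored checksum while the data is still encrypted, then
 * decrypt: once decrypted, the stored MAC can no longer be checked.
 */
static int verify_then_decrypt(uint8_t *buf, size_t len, struct csum expected)
{
	struct csum got = toy_checksum(buf, len);
	if (memcmp(&got, &expected, sizeof(got)))
		return -1;			/* csum_err */

	toy_cipher(buf, len);
	return 0;
}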
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -890,43 +992,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bool page_alloc_failed = false;
int ret, more = 0;
+ if (op->incompressible)
+ op->compression_opt = 0;
+
BUG_ON(!bio_sectors(src));
ec_buf = bch2_writepoint_ec_buf(c, wp);
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+ ret = bch2_write_prep_encoded_data(op, wp);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
}
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
}
if (ec_buf ||
op->compression_opt ||
(op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ !(op->flags & BCH_WRITE_pages_stable)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ !(op->flags & BCH_WRITE_pages_owned))) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+ if (!bounce && write_corrupt_ratio) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+#endif
saved_iter = dst->bi_iter;
do {
@@ -940,7 +1050,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
break;
BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ (op->flags & BCH_WRITE_data_encoded) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_opt && !bounce);
@@ -978,7 +1088,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
}
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ if ((op->flags & BCH_WRITE_data_encoded) &&
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
@@ -996,12 +1106,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
* data can't be modified (by userspace) while it's in
* flight.
*/
- if (bch2_rechecksum_bio(c, src, version, op->crc,
+ ret = bch2_rechecksum_bio(c, src, version, op->crc,
&crc, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
+ op->csum_type);
+ if (ret)
+ goto err;
/*
* rchecksum_bio sets compression_type on crc from op->crc,
* this isn't always correct as sometimes we're changing
@@ -1010,13 +1121,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.compression_type = compression_type;
crc.nonce = nonce;
} else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
+ if ((op->flags & BCH_WRITE_data_encoded) &&
+ (ret = bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
+ op->crc.csum_type)))
+ goto err;
crc.compressed_size = dst_len >> 9;
crc.uncompressed_size = src_len >> 9;
@@ -1036,6 +1147,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (write_corrupt_ratio) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+#endif
+
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
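
The CONFIG_BCACHEFS_DEBUG hunks above force a bounce buffer when write_corrupt_ratio is set, then corrupt the bounced copy just before submission so the read path's checksum verification and recovery can be exercised without real hardware faults. A sketch of what such an injector might look like, assuming 1-in-ratio semantics (bch2_maybe_corrupt_bio()'s real behavior may differ):

#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/random.h>

/* Flip one byte in roughly 1 out of every 'ratio' bios. */
static void maybe_corrupt_bio(struct bio *bio, unsigned ratio)
{
	if (!ratio || !bio->bi_iter.bi_size || get_random_u32_below(ratio))
		return;

	unsigned target = get_random_u32_below(bio->bi_iter.bi_size);
	struct bvec_iter iter;
	struct bio_vec bv;

	bio_for_each_segment(bv, bio, iter) {
		if (target < bv.bv_len) {
			u8 *p = bvec_kmap_local(&bv);
			p[target] ^= 0xff;	/* corrupt a single byte */
			kunmap_local(p);
			return;
		}
		target -= bv.bv_len;
	}
}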
@@ -1067,10 +1186,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
do_write:
*_dst = dst;
return more;
-csum_err:
- bch_err(c, "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
- op->flags & BCH_WRITE_MOVE ? "move" : "user");
- ret = -EIO;
err:
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
@@ -1093,6 +1208,8 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
return false;
e = bkey_s_c_to_extent(k);
+
+ guard(rcu)();
extent_for_each_ptr_decode(e, p, entry) {
if (crc_is_encoded(p.crc) || p.has_ec)
return false;
@@ -1103,20 +1220,6 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
return replicas >= op->opts.data_replicas;
}
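
guard(rcu)() is the scope-based guard from <linux/cleanup.h>: rcu_read_lock() runs at the declaration and rcu_read_unlock() runs automatically on every exit from the scope, so the early returns in the pointer walk above need no explicit unlock. A side-by-side sketch (struct foo is illustrative only):

#include <linux/cleanup.h>
#include <linux/rcupdate.h>

struct foo { bool ok; };

/* Without the guard, every early return needs a matching unlock: */
static bool example_locked(struct foo __rcu *p)
{
	rcu_read_lock();
	struct foo *f = rcu_dereference(p);
	bool ret = f && f->ok;
	rcu_read_unlock();
	return ret;
}

/* With guard(rcu)(), the read lock is dropped on every scope exit: */
static bool example_guarded(struct foo __rcu *p)
{
	guard(rcu)();
	struct foo *f = rcu_dereference(p);
	return f && f->ok;
}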
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
-
- for_each_keylist_key(&op->insert_keys, k) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- }
-}
-
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *orig,
@@ -1150,48 +1253,43 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0) ?:
bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "%s write error while doing btree update: %s",
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
- }
-
- if (ret) {
- op->error = ret;
+ if (ret)
break;
- }
}
bch2_trans_put(trans);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
+ }
+
+ if (ret)
+ op->error = ret;
}
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- bch2_nocow_write_unlock(op);
-
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- op->error = -EIO;
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
+ op->error = bch_err_throw(op->c, data_write_io);
+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1218,9 +1316,9 @@ static void bch2_nocow_write(struct bch_write_op *op)
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
struct bucket_to_lock *stale_at;
- int ret;
+ int stale, ret;
- if (op->flags & BCH_WRITE_MOVE)
+ if (op->flags & BCH_WRITE_move)
return;
darray_init(&buckets);
@@ -1234,13 +1332,17 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bio *bio = &op->wbio.bio;
buckets.nr = 0;
- k = bch2_btree_iter_peek_slot(&iter);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
break;
@@ -1259,21 +1361,23 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+ BCH_DEV_WRITE_REF_io_write);
+ if (unlikely(!ca))
+ goto err_get_ioref;
+
+ struct bpos b = PTR_BUCKET_POS(ca, ptr);
struct nocow_lock_bucket *l =
bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
prefetch(l);
- if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
- goto err_get_ioref;
-
/* XXX allocating memory with btree locks held - rare */
darray_push_gfp(&buckets, ((struct bucket_to_lock) {
.b = b, .gen = ptr->gen, .l = l,
}), GFP_KERNEL|__GFP_NOFAIL);
if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ op->flags |= BCH_WRITE_convert_unwritten;
}
/* Unlock before taking nocow locks, doing IO: */
@@ -1281,20 +1385,18 @@ retry:
bch2_trans_unlock(trans);
bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ if (op->flags & BCH_WRITE_convert_unwritten)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+ struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
bucket_to_u64(i->b),
BUCKET_NOCOW_LOCK_UPDATE);
- rcu_read_lock();
- bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
- rcu_read_unlock();
-
+ int gen = bucket_gen_get(ca, i->b.offset);
+ stale = gen < 0 ? gen : gen_after(gen, i->gen);
if (unlikely(stale)) {
stale_at = i;
goto err_bucket_stale;
@@ -1308,7 +1410,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_submitted;
}
op->pos.offset += bio_sectors(bio);
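
In the stale check two hunks up, bucket_gen_get() now returns a negative errno for an invalid bucket, which the caller keeps distinct from a merely stale (wrapped) generation. Generations are small wrapping counters, so "after" is decided by signed difference; my reconstruction of the comparison, assuming 8-bit gens:

#include <stdbool.h>
#include <stdint.h>

/*
 * Generations wrap, so a plain '>' is wrong near the wrap point; the signed
 * difference is valid while the two gens are within 127 steps of each other.
 */
static inline bool gen_after(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

/* gen_after(1, 255) is true: gen 1 is two increments past 255 after wrap. */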
@@ -1318,13 +1420,14 @@ retry:
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
+
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);
bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_submitted)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
out:
bch2_trans_iter_exit(trans, &iter);
@@ -1332,23 +1435,22 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode, op->pos.offset << 9,
- "%s: btree lookup error %s", __func__, bch2_err_str(ret));
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_submitted;
}
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_DONE)) {
+ if (!(op->flags & BCH_WRITE_submitted)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_SYNC) {
+ } else if (op->flags & BCH_WRITE_sync) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
@@ -1362,7 +1464,8 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
/* Fall back to COW path: */
goto out;
@@ -1373,8 +1476,18 @@ err_bucket_stale:
break;
}
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch_err_throw(c, data_write_invalid_ptr);
+ } else {
+ /* We can retry this: */
+ ret = bch_err_throw(c, transaction_restart);
+ }
+ printbuf_exit(&buf);
+
goto err_get_ioref;
}
@@ -1390,7 +1503,7 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_submitted)
goto out_nofs_restore;
}
again:
@@ -1417,19 +1530,17 @@ again:
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
op->write_point,
&op->devs_have,
op->nr_replicas,
op->nr_replicas_required,
op->watermark,
op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
+ &op->cl, &wp)));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@@ -1445,16 +1556,12 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_DONE;
-
- if (ret < 0) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s(): %s error: %s", __func__,
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
+ op->flags |= BCH_WRITE_submitted;
+
+ if (unlikely(ret < 0)) {
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1480,13 +1587,14 @@ err:
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
- if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_DONE) &&
- !(op->flags & BCH_WRITE_IN_WORKER))) {
- closure_sync(&op->cl);
+ if ((op->flags & BCH_WRITE_sync) ||
+ (!(op->flags & BCH_WRITE_submitted) &&
+ !(op->flags & BCH_WRITE_in_worker))) {
+ bch2_wait_on_allocator(c, &op->cl);
+
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_submitted))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1505,8 +1613,10 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ op->flags |= BCH_WRITE_wrote_data_inline;
+ op->flags |= BCH_WRITE_submitted;
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
@@ -1523,7 +1633,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos;
- id->k.version = op->version;
+ id->k.bversion = op->version;
id->k.size = sectors;
iter = bio->bi_iter;
@@ -1569,33 +1679,35 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ async_object_list_add(c, write_op, op, &op->list_idx);
+
+ if (op->flags & BCH_WRITE_only_specified_devs)
+ op->flags |= BCH_WRITE_alloc_nowait;
+
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s write error: misaligned write",
- op->flags & BCH_WRITE_MOVE ? "move" : "user");
- op->error = -EIO;
+ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
+ op->error = bch_err_throw(c, data_write_misaligned);
goto err;
}
if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
- if (!(op->flags & BCH_WRITE_MOVE) &&
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
+ if (!(op->flags & BCH_WRITE_move) &&
+ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ if (!(op->flags & BCH_WRITE_move))
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1613,6 +1725,7 @@ err:
bch2_disk_reservation_put(c, &op->res);
closure_debug_destroy(&op->cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
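
The misaligned-write check earlier in bch2_write() tests bi_size against block_size - 1, which computes size mod block_size for free since block sizes are powers of two:

#include <stdbool.h>
#include <stdint.h>

/* For power-of-two block_size, size & (block_size - 1) == size % block_size. */
static inline bool is_misaligned(uint32_t size, uint32_t block_size)
{
	return size & (block_size - 1);
}

/* is_misaligned(4096 + 512, 4096) -> true; is_misaligned(8192, 4096) -> false */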
@@ -1626,43 +1739,42 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
- prt_str(out, "pos: ");
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "started: ");
+ prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);
- prt_str(out, "flags: ");
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
- prt_newline(out);
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
printbuf_indent_sub(out, 2);
}
void bch2_fs_io_write_exit(struct bch_fs *c)
{
- mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
int bch2_fs_io_write_init(struct bch_fs *c)
{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_write_init;
-
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+ bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
+ return bch_err_throw(c, ENOMEM_bio_write_init);
return 0;
}
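
Both biosets are sized with front_pad = offsetof(struct bch_write_bio, bio): the struct bio sits at the end of the wrapper, the pad reserves the leading fields in every allocation, and container_of() recovers the wrapper from the bio the block layer hands back. A minimal sketch of the pattern (the wrapper type here is illustrative, not the real bch_write_bio):

#include <linux/bio.h>

struct my_write_bio {
	unsigned long	submit_time;
	unsigned	dev;
	struct bio	bio;	/* must be last: allocations return &bio */
};

static struct bio_set my_bioset;

static int my_bioset_init(void)
{
	/* front_pad reserves room for the fields preceding 'bio' */
	return bioset_init(&my_bioset, 4,
			   offsetof(struct my_write_bio, bio), 0);
}

static struct my_write_bio *my_write_bio_alloc(struct block_device *bdev)
{
	struct bio *bio = bio_alloc_bioset(bdev, 0, REQ_OP_WRITE,
					   GFP_KERNEL, &my_bioset);

	return container_of(bio, struct my_write_bio, bio);
}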