Diffstat (limited to 'fs/bcachefs/io_read.c')
-rw-r--r--   fs/bcachefs/io_read.c   474
1 file changed, 292 insertions(+), 182 deletions(-)
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..aa91fcf51eec 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -21,6 +21,7 @@
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
+#include "reflink.h"
#include "subvolume.h"
#include "trace.h"
@@ -58,7 +59,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
}
rcu_read_unlock();
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
+ return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@@ -84,29 +85,38 @@ struct promote_op {
};
static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
};
+static inline bool have_io_error(struct bch_io_failures *failed)
+{
+ return failed && failed->nr;
+}
+
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
- unsigned flags)
+ unsigned flags,
+ struct bch_io_failures *failed)
{
- BUG_ON(!opts.promote_target);
+ if (!have_io_error(failed)) {
+ BUG_ON(!opts.promote_target);
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return -BCH_ERR_nopromote_may_not;
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
- if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
- if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+ }
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
@@ -163,7 +173,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
unsigned sectors,
- struct bch_read_bio **rbio)
+ struct bch_read_bio **rbio,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
@@ -174,7 +185,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
@@ -216,14 +227,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+ struct data_update_opts update_opts = {};
+
+ if (!have_io_error(failed)) {
+ update_opts.target = opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
+ } else {
+ update_opts.target = opts.foreground_target;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev))
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+ }
+
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- (struct data_update_opts) {
- .target = opts.promote_target,
- .extra_replicas = 1,
- .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
- },
+ update_opts,
btree_id, k);
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
@@ -243,7 +268,8 @@ err:
bio_free_pages(&(*rbio)->bio);
kfree(*rbio);
*rbio = NULL;
- kfree(op);
+ /* We may have added to the rhashtable and thus need rcu freeing: */
+ kfree_rcu(op, rcu);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
@@ -257,10 +283,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
unsigned flags,
struct bch_read_bio **rbio,
bool *bounce,
- bool *read_full)
+ bool *read_full,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
- bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /*
+ * if failed != NULL we're not actually doing a promote, we're
+ * recovering from an io/checksum error
+ */
+ bool promote_full = (have_io_error(failed) ||
+ *read_full ||
+ READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -271,7 +304,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct promote_op *promote;
int ret;
- ret = should_promote(c, k, pos, opts, flags);
+ ret = should_promote(c, k, pos, opts, flags, failed);
if (ret)
goto nopromote;
@@ -279,7 +312,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio);
+ k, pos, pick, opts, sectors, rbio, failed);
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@@ -294,6 +327,20 @@ nopromote:
/* Read */
+static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ return bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9);
+}
+
+static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
+}
+
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -378,17 +425,17 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ rbio->read_pos, BTREE_ITER_slots);
retry:
+ bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
goto err;
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
@@ -472,6 +519,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
}
}
+static void bch2_read_io_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bio *bio = &rbio->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
+
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+}
+
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_read_bio *rbio)
{
@@ -487,11 +557,11 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
if ((ret = bkey_err(k)))
goto out;
- if (bversion_cmp(k.k->version, rbio->version) ||
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -523,7 +593,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
goto out;
ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -531,8 +601,75 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+static void bch2_read_csum_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bio *src = &rbio->bio;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decompress_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decompression error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decrypt_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decrypt error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
}
/* Inner part that may run in process context */
@@ -541,7 +678,6 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -642,31 +778,13 @@ csum_err:
goto out;
}
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data %s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decompression error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decrypt_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decrypt error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
}
@@ -675,7 +793,7 @@ static void bch2_read_endio(struct bio *bio)
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
@@ -687,17 +805,13 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ if (unlikely(bio->bi_status)) {
+ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
return;
}
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr)) {
+ (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -718,72 +832,45 @@ static void bch2_read_endio(struct bio *bio)
bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
-int __bch2_read_indirect_extent(struct btree_trans *trans,
- unsigned *offset_into_extent,
- struct bkey_buf *orig_k)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 reflink_offset;
- int ret;
-
- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
- *offset_into_extent;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
- POS(0, reflink_offset), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_reflink_v &&
- k.k->type != KEY_TYPE_indirect_inline_data) {
- bch_err_inum_offset_ratelimited(trans->c,
- orig_k->k->k.p.inode,
- orig_k->k->k.p.offset << 9,
- "%llu len %u points to nonexistent indirect extent %llu",
- orig_k->k->k.p.offset,
- orig_k->k->k.size,
- reflink_offset);
- bch2_inconsistent_error(trans->c);
- ret = -EIO;
- goto err;
- }
-
- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c k,
struct bch_extent_ptr ptr)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
struct btree_iter iter;
struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(c, &ptr),
- BTREE_ITER_CACHED);
+ PTR_BUCKET_POS(ca, &ptr),
+ BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:");
- printbuf_indent_add(&buf, 2);
- prt_newline(&buf);
+ int gen = bucket_gen_get(ca, iter.pos.offset);
+ if (gen >= 0) {
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+ prt_printf(&buf, "memory gen: %u", gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
- prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
}
bch2_fs_inconsistent(c, "%s", buf.buf);
@@ -801,7 +888,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -825,14 +911,29 @@ retry_pick:
if (!pick_ret)
goto hole;
- if (pick_ret < 0) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode, read_pos.offset << 9,
- "no device to read from");
+ if (unlikely(pick_ret < 0)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
goto err;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -842,25 +943,24 @@ retry_pick:
*/
if ((flags & BCH_READ_IN_RETRY) &&
!pick.ptr.cached &&
- unlikely(ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ ca &&
+ unlikely(dev_ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
+ percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(trans);
-
if (flags & BCH_READ_NODECODE) {
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
goto hole;
+ }
iter.bi_size = pick.crc.compressed_size << 9;
goto get_bio;
@@ -888,9 +988,9 @@ retry_pick:
bounce = true;
}
- if (orig->opts.promote_target)
+ if (orig->opts.promote_target || have_io_error(failed))
promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
+ &rbio, &bounce, &read_full, failed);
if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@@ -965,7 +1065,7 @@ get_bio:
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
@@ -977,10 +1077,13 @@ get_bio:
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
- rbio->version = k.k->version;
+ rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
+ if (flags & BCH_READ_NODECODE)
+ orig->pick = pick;
+
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
@@ -995,7 +1098,7 @@ get_bio:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -1004,12 +1107,25 @@ get_bio:
trace_and_count(c, read_split, &orig->bio);
}
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ if (!(flags & BCH_READ_IN_RETRY))
+ bch2_trans_unlock(trans);
+ else
+ bch2_trans_unlock_long(trans);
+
if (!rbio->pick.idx) {
- if (!rbio->have_ioref) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode,
- read_pos.offset << 9,
- "no device to read from");
+ if (unlikely(!rbio->have_ioref)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
+ prt_printf(&buf, "no device to read from:\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1035,7 +1151,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1047,6 +1163,8 @@ out:
if (likely(!(flags & BCH_READ_IN_RETRY))) {
return 0;
} else {
+ bch2_trans_unlock(trans);
+
int ret;
rbio->context = RBIO_CONTEXT_UNBOUND;
@@ -1097,34 +1215,26 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
- u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ POS(inum.inum, bvec_iter.bi_sector),
+ BTREE_ITER_slots);
+
while (1) {
- unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, bvec_iter.bi_sector));
@@ -1132,18 +1242,18 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
- offset_into_extent = iter.pos.offset -
+ s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
+ unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&sk, c, k);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -1151,9 +1261,9 @@ retry:
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
*/
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
@@ -1163,36 +1273,36 @@ retry:
data_btree, k,
offset_into_extent, failed, flags);
if (ret)
- break;
+ goto err;
if (flags & BCH_READ_LAST_FRAGMENT)
break;
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ ret != READ_RETRY &&
+ ret != READ_RETRY_AVOID)
break;
}
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- ret == READ_RETRY ||
- ret == READ_RETRY_AVOID)
- goto retry;
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
+ bch2_trans_iter_exit(trans, &iter);
if (ret) {
- bch_err_inum_offset_ratelimited(c, inum.inum,
- bvec_iter.bi_sector << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
}
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
}
void bch2_fs_io_read_exit(struct bch_fs *c)