Diffstat (limited to 'net/ceph/osd_client.c')
| -rw-r--r-- | net/ceph/osd_client.c | 1006 |
1 files changed, 618 insertions, 388 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4fea3c33af2a..6664ea73ccf8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, osd_data->num_bvecs = num_bvecs; } +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data, + struct iov_iter *iter) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_ITER; + osd_data->iter = *iter; +} + static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) { @@ -213,16 +220,6 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); -void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) -{ - struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_pagelist_init(osd_data, pagelist); -} -EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); - #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, @@ -264,28 +261,31 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); -static void osd_req_op_cls_request_info_pagelist( - struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) +/** + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer + * @osd_req: The request to set up + * @which: Index of the operation in which to set the iter + * @iter: The buffer iterator + */ +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, + unsigned int which, struct iov_iter *iter) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_data(osd_req, which, cls, request_info); - ceph_osd_data_pagelist_init(osd_data, pagelist); + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_iter_init(osd_data, iter); } +EXPORT_SYMBOL(osd_req_op_extent_osd_iter); -void osd_req_op_cls_request_data_pagelist( +static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_data(osd_req, which, cls, request_data); + osd_data = osd_req_op_data(osd_req, which, cls, request_info); ceph_osd_data_pagelist_init(osd_data, pagelist); - osd_req->r_ops[which].cls.indata_len += pagelist->length; - osd_req->r_ops[which].indata_len += pagelist->length; } -EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, @@ -346,6 +346,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) #endif /* CONFIG_BLOCK */ case CEPH_OSD_DATA_TYPE_BVECS: return osd_data->bvec_pos.iter.bi_size; + case CEPH_OSD_DATA_TYPE_ITER: + return iov_iter_count(&osd_data->iter); default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; @@ -376,8 +378,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, switch (op->op) { case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: + kfree(op->extent.sparse_ext); ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -445,8 +449,10 @@ static void target_copy(struct ceph_osd_request_target *dest, dest->size = src->size; dest->min_size = 
src->min_size; dest->sort_bitwise = src->sort_bitwise; + dest->recovery_deletes = src->recovery_deletes; dest->flags = src->flags; + dest->used_replica = src->used_replica; dest->paused = src->paused; dest->epoch = src->epoch; @@ -523,7 +529,7 @@ EXPORT_SYMBOL(ceph_osdc_put_request); static void request_init(struct ceph_osd_request *req) { - /* req only, each op is zeroed in _osd_req_op_init() */ + /* req only, each op is zeroed in osd_req_op_init() */ memset(req, 0, sizeof(*req)); kref_init(&req->r_kref); @@ -535,43 +541,6 @@ static void request_init(struct ceph_osd_request *req) target_init(&req->r_t); } -/* - * This is ugly, but it allows us to reuse linger registration and ping - * requests, keeping the structure of the code around send_linger{_ping}() - * reasonable. Setting up a min_nr=2 mempool for each linger request - * and dealing with copying ops (this blasts req only, watch op remains - * intact) isn't any better. - */ -static void request_reinit(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - bool mempool = req->r_mempool; - unsigned int num_ops = req->r_num_ops; - u64 snapid = req->r_snapid; - struct ceph_snap_context *snapc = req->r_snapc; - bool linger = req->r_linger; - struct ceph_msg *request_msg = req->r_request; - struct ceph_msg *reply_msg = req->r_reply; - - dout("%s req %p\n", __func__, req); - WARN_ON(kref_read(&req->r_kref) != 1); - request_release_checks(req); - - WARN_ON(kref_read(&request_msg->kref) != 1); - WARN_ON(kref_read(&reply_msg->kref) != 1); - target_destroy(&req->r_t); - - request_init(req); - req->r_osdc = osdc; - req->r_mempool = mempool; - req->r_num_ops = num_ops; - req->r_snapid = snapid; - req->r_snapc = snapc; - req->r_linger = linger; - req->r_request = request_msg; - req->r_reply = reply_msg; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -704,6 +673,7 @@ static void get_num_data_items(struct ceph_osd_request *req, /* reply */ case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_LIST_WATCHERS: *num_reply_data_items += 1; break; @@ -744,8 +714,8 @@ EXPORT_SYMBOL(ceph_osdc_alloc_messages); * other information associated with them. It also serves as a * common init routine for all the other init functions, below. 
*/ -static struct ceph_osd_req_op * -_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, +struct ceph_osd_req_op * +osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -760,12 +730,6 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, return op; } - -void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, u32 flags) -{ - (void)_osd_req_op_init(osd_req, which, opcode, flags); -} EXPORT_SYMBOL(osd_req_op_init); void osd_req_op_extent_init(struct ceph_osd_request *osd_req, @@ -773,13 +737,13 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && - opcode != CEPH_OSD_OP_TRUNCATE); + opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ); op->extent.offset = offset; op->extent.length = length; @@ -820,7 +784,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, BUG_ON(which + 1 >= osd_req->r_num_ops); prev_op = &osd_req->r_ops[which]; - op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); /* dup previous one */ op->indata_len = prev_op->indata_len; op->outdata_len = prev_op->outdata_len; @@ -843,7 +807,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size_t size; int ret; - op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) @@ -881,8 +845,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; int ret; @@ -922,14 +886,30 @@ EXPORT_SYMBOL(osd_req_op_xattr_init); * @watch_opcode: CEPH_OSD_WATCH_OP_* */ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, - u64 cookie, u8 watch_opcode) + u8 watch_opcode, u64 cookie, u32 gen) { struct ceph_osd_req_op *op; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; - op->watch.gen = 0; + op->watch.gen = gen; +} + +/* + * prot_ver, timeout and notify payload (may be empty) should already be + * encoded in @request_pl + */ +static void osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, struct ceph_pagelist *request_pl) +{ + struct ceph_osd_req_op *op; + + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; + + ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl); + op->indata_len = request_pl->length; } /* @@ -941,10 +921,9 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_write_size, u32 flags) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - CEPH_OSD_OP_SETALLOCHINT, - 0); + 
struct ceph_osd_req_op *op; + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; op->alloc_hint.flags = flags; @@ -977,6 +956,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, #endif } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) { + ceph_msg_data_add_iter(msg, &osd_data->iter); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } @@ -989,6 +970,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_STAT: break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: @@ -1043,6 +1025,10 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->copy_from.src_fadvise_flags = cpu_to_le32(src->copy_from.src_fadvise_flags); break; + case CEPH_OSD_OP_ASSERT_VER: + dst->assert_ver.unused = cpu_to_le64(0); + dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1085,7 +1071,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && - opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE && + opcode != CEPH_OSD_OP_SPARSE_READ); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); @@ -1117,24 +1104,39 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } - req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + req->r_flags = flags | osdc->client->options->read_from_replica; req->r_snapid = vino.snap; if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; - if (num_ops > 1) + if (num_ops > 1) { + int num_req_ops, num_rep_ops; + /* - * This is a special case for ceph_writepages_start(), but it - * also covers ceph_uninline_data(). If more multi-op request - * use cases emerge, we will need a separate helper. + * If this is a multi-op write request, assume that we'll need + * request ops. If it's a multi-op read then assume we'll need + * reply ops. Anything else and call it -EINVAL. */ - r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); - else + if (flags & CEPH_OSD_FLAG_WRITE) { + num_req_ops = num_ops; + num_rep_ops = 0; + } else if (flags & CEPH_OSD_FLAG_READ) { + num_req_ops = 0; + num_rep_ops = num_ops; + } else { + r = -EINVAL; + goto fail; + } + + r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops, + num_rep_ops); + } else { r = ceph_osdc_alloc_messages(req, GFP_NOFS); + } if (r) goto fail; @@ -1146,6 +1148,20 @@ fail: } EXPORT_SYMBOL(ceph_osdc_new_request); +int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) +{ + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + + op->extent.sparse_ext_cnt = cnt; + op->extent.sparse_ext = kmalloc_array(cnt, + sizeof(*op->extent.sparse_ext), + GFP_NOFS); + if (!op->extent.sparse_ext) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map); + /* * We keep osd requests in an rbtree, sorted by ->r_tid. 
*/ @@ -1203,6 +1219,7 @@ static void osd_init(struct ceph_osd *osd) { refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); + spin_lock_init(&osd->o_requests_lock); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; osd->o_backoff_mappings = RB_ROOT; @@ -1213,6 +1230,13 @@ static void osd_init(struct ceph_osd *osd) mutex_init(&osd->lock); } +static void ceph_init_sparse_read(struct ceph_sparse_read *sr) +{ + kfree(sr->sr_extent); + memset(sr, '\0', sizeof(*sr)); + sr->sr_state = CEPH_SPARSE_READ_HDR; +} + static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); @@ -1223,6 +1247,8 @@ static void osd_cleanup(struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); + ceph_init_sparse_read(&osd->o_sparse_read); + if (osd->o_auth.authorizer) { WARN_ON(osd_homeless(osd)); ceph_auth_destroy_authorizer(osd->o_auth.authorizer); @@ -1242,6 +1268,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; + osd->o_sparse_op_idx = -1; + + ceph_init_sparse_read(&osd->o_sparse_read); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); @@ -1432,7 +1461,9 @@ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); + spin_lock(&osd->o_requests_lock); insert_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); req->r_osd = osd; } @@ -1444,7 +1475,9 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) req, req->r_tid); req->r_osd = NULL; + spin_lock(&osd->o_requests_lock); erase_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); put_osd(osd); if (!osd_homeless(osd)) @@ -1505,7 +1538,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, static int pick_random_replica(const struct ceph_osds *acting) { - int i = prandom_u32() % acting->size; + int i = get_random_u32_below(acting->size); dout("%s picked osd%d, primary osd%d\n", __func__, acting->osds[i], acting->primary); @@ -2042,6 +2075,7 @@ static void setup_request_data(struct ceph_osd_request *req) &op->raw_data_in); break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; @@ -2390,7 +2424,11 @@ again: if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { - pr_warn_ratelimited("FULL or reached pool quota\n"); + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) + pr_warn_ratelimited("cluster is full (osdmap FULL)\n"); + else + pr_warn_ratelimited("pool %lld is full or reached quota\n", + req->r_t.base_oloc.pool); req->r_t.paused = true; maybe_request_map(osdc); } @@ -2431,14 +2469,11 @@ promote: static void account_request(struct ceph_osd_request *req) { - struct ceph_osd_client *osdc = req->r_osdc; - WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; - req->r_flags |= osdc->client->options->osd_req_flags; - atomic_inc(&osdc->num_requests); + atomic_inc(&req->r_osdc->num_requests); req->r_start_stamp = jiffies; req->r_start_latency = ktime_get(); @@ -2460,8 +2495,10 @@ static void finish_request(struct ceph_osd_request *req) req->r_end_latency = ktime_get(); - if (req->r_osd) + if (req->r_osd) { + ceph_init_sparse_read(&req->r_osd->o_sparse_read); unlink_request(req->r_osd, req); + } atomic_dec(&osdc->num_requests); 
/* @@ -2735,10 +2772,13 @@ static void linger_release(struct kref *kref) WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); - if (lreq->reg_req) - ceph_osdc_put_request(lreq->reg_req); - if (lreq->ping_req) - ceph_osdc_put_request(lreq->ping_req); + if (lreq->request_pl) + ceph_pagelist_release(lreq->request_pl); + if (lreq->notify_id_pages) + ceph_release_page_vector(lreq->notify_id_pages, 1); + + ceph_osdc_put_request(lreq->reg_req); + ceph_osdc_put_request(lreq->ping_req); target_destroy(&lreq->t); kfree(lreq); } @@ -3007,6 +3047,12 @@ static void linger_commit_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); linger_reg_commit_complete(lreq, req->r_result); @@ -3030,6 +3076,7 @@ static void linger_commit_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3052,6 +3099,12 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->last_error); if (req->r_result < 0) { @@ -3061,48 +3114,64 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } static void send_linger(struct ceph_osd_linger_request *lreq) { - struct ceph_osd_request *req = lreq->reg_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req; + int ret; - verify_osdc_wrlocked(req->r_osdc); + verify_osdc_wrlocked(osdc); + mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->reg_req) { + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + ceph_osdc_put_request(lreq->reg_req); + } - request_reinit(req); - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = lreq->t.flags; + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); + + target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; - mutex_lock(&lreq->lock); if (lreq->is_watch && lreq->committed) { - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id); - op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; - op->watch.gen = ++lreq->register_gen; + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT, + lreq->linger_id, ++lreq->register_gen); dout("lreq %p reconnect register_gen %u\n", lreq, - op->watch.gen); + req->r_ops[0].watch.gen); req->r_callback = linger_reconnect_cb; } else { - if (!lreq->is_watch) + if (lreq->is_watch) { + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH, + lreq->linger_id, 0); + } else { lreq->notify_id = 0; - else - WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + + refcount_inc(&lreq->request_pl->refcnt); + osd_req_op_notify_init(req, 0, lreq->linger_id, + lreq->request_pl); + ceph_osd_data_pages_init( + osd_req_op_data(req, 0, notify, 
response_data), + lreq->notify_id_pages, PAGE_SIZE, 0, false, false); + } dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } - mutex_unlock(&lreq->lock); + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->reg_req = req; + mutex_unlock(&lreq->lock); submit_request(req, true); } @@ -3112,6 +3181,12 @@ static void linger_ping_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->ping_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->ping_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); @@ -3127,6 +3202,7 @@ static void linger_ping_cb(struct ceph_osd_request *req) lreq->register_gen, req->r_ops[0].watch.gen); } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3134,8 +3210,8 @@ static void linger_ping_cb(struct ceph_osd_request *req) static void send_linger_ping(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; - struct ceph_osd_request *req = lreq->ping_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_request *req; + int ret; if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); @@ -3147,19 +3223,26 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) __func__, lreq, lreq->linger_id, lreq->ping_sent, lreq->register_gen); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->ping_req) { + if (lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + ceph_osdc_put_request(lreq->ping_req); + } - request_reinit(req); - target_copy(&req->r_t, &lreq->t); + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id || - op->watch.op != CEPH_OSD_WATCH_OP_PING); - op->watch.gen = lreq->register_gen; + target_copy(&req->r_t, &lreq->t); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id, + lreq->register_gen); req->r_callback = linger_ping_cb; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); + req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->ping_req = req; ceph_osdc_get_request(req); account_request(req); @@ -3175,12 +3258,6 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) down_write(&osdc->lock); linger_register(lreq); - if (lreq->is_watch) { - lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; - lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; - } else { - lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; - } calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); @@ -3212,9 +3289,9 @@ static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - if (lreq->is_watch && lreq->ping_req->r_osd) + if (lreq->ping_req && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); - if (lreq->reg_req->r_osd) + if (lreq->reg_req && lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); @@ -3319,17 +3396,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret 
= wait_for_completion_interruptible(&lreq->reg_commit_wait); + ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } -static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, + unsigned long timeout) { - int ret; + long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); - return ret ?: lreq->notify_finish_error; + left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait, + ceph_timeout_jiffies(timeout)); + if (left <= 0) + left = left ?: -ETIMEDOUT; + else + left = lreq->notify_finish_error; /* completed */ + + return left; } /* @@ -3773,6 +3857,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) * one (type of) reply back. */ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_version = m.user_version; req->r_result = m.result ?: data_len; finish_request(req); mutex_unlock(&osd->lock); @@ -3864,7 +3949,7 @@ static void scan_requests(struct ceph_osd *osd, if (!force_resend && !force_resend_writes) break; - /* fall through */ + fallthrough; case CALC_TARGET_NEED_RESEND: cancel_linger_map_check(lreq); /* @@ -3901,7 +3986,7 @@ static void scan_requests(struct ceph_osd *osd, !force_resend_writes)) break; - /* fall through */ + fallthrough; case CALC_TARGET_NEED_RESEND: cancel_map_check(req); unlink_request(osd, req); @@ -3928,9 +4013,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -4561,21 +4648,23 @@ bad: /* * Register request, send initial attempt. */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { down_read(&osdc->lock); submit_request(req, false); up_read(&osdc->lock); - - return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); /* - * Unregister a registered request. The request is not completed: - * ->r_result isn't set and __complete_request() isn't called. + * Unregister request. If @req was registered, it isn't completed: + * r_result isn't set and __complete_request() isn't invoked. + * + * If @req wasn't registered, this call may have raced with + * handle_reply(), in which case r_result would already be set and + * __complete_request() would be getting invoked, possibly even + * concurrently with this call. 
*/ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { @@ -4661,43 +4750,6 @@ again: } EXPORT_SYMBOL(ceph_osdc_sync); -static struct ceph_osd_request * -alloc_linger_request(struct ceph_osd_linger_request *lreq) -{ - struct ceph_osd_request *req; - - req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); - if (!req) - return NULL; - - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - return req; -} - -static struct ceph_osd_request * -alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) -{ - struct ceph_osd_request *req; - - req = alloc_linger_request(lreq); - if (!req) - return NULL; - - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - osd_req_op_watch_init(req, 0, 0, watch_opcode); - - if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { - ceph_osdc_put_request(req); - return NULL; - } - - return req; -} - /* * Returns a handle, caller owns a ref. */ @@ -4727,18 +4779,6 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); - lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); - if (!lreq->reg_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - - lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); - if (!lreq->ping_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (ret) { @@ -4776,14 +4816,14 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&req->r_mtime); - osd_req_op_watch_init(req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_UNWATCH); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH, + lreq->linger_id, 0); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); linger_cancel(lreq); linger_put(lreq); ret = wait_request_timeout(req, opts->mount_timeout); @@ -4802,7 +4842,7 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, struct ceph_pagelist *pl; int ret; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) @@ -4854,7 +4894,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); out_put_req: @@ -4863,35 +4903,6 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_notify_ack); -static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, - u64 cookie, u32 prot_ver, u32 timeout, - void *payload, u32 payload_len) -{ - struct ceph_osd_req_op *op; - struct ceph_pagelist *pl; - int ret; - - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); - op->notify.cookie = cookie; - - pl = ceph_pagelist_alloc(GFP_NOIO); - if (!pl) - return -ENOMEM; - - ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ - ret |= ceph_pagelist_encode_32(pl, timeout); - ret |= ceph_pagelist_encode_32(pl, payload_len); - ret |= ceph_pagelist_append(pl, payload, payload_len); - if (ret) { - ceph_pagelist_release(pl); - return -ENOMEM; - } - - ceph_osd_data_pagelist_init(&op->notify.request_data, pl); - op->indata_len = pl->length; - return 0; -} - /* * @timeout: in seconds * @@ 
-4910,7 +4921,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, size_t *preply_len) { struct ceph_osd_linger_request *lreq; - struct page **pages; int ret; WARN_ON(!timeout); @@ -4923,46 +4933,41 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, if (!lreq) return -ENOMEM; - lreq->preply_pages = preply_pages; - lreq->preply_len = preply_len; - - ceph_oid_copy(&lreq->t.base_oid, oid); - ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_READ; - - lreq->reg_req = alloc_linger_request(lreq); - if (!lreq->reg_req) { + lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO); + if (!lreq->request_pl) { ret = -ENOMEM; goto out_put_lreq; } - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, - payload, payload_len); - if (ret) + ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout); + ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len); + ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len); + if (ret) { + ret = -ENOMEM; goto out_put_lreq; + } /* for notify_id */ - pages = ceph_alloc_page_vector(1, GFP_NOIO); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); + lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(lreq->notify_id_pages)) { + ret = PTR_ERR(lreq->notify_id_pages); + lreq->notify_id_pages = NULL; goto out_put_lreq; } - ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, - response_data), - pages, PAGE_SIZE, 0, false, true); - ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); - if (ret) - goto out_put_lreq; + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) - ret = linger_notify_finish_wait(lreq); + ret = linger_notify_finish_wait(lreq, + msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); @@ -4973,40 +4978,6 @@ out_put_lreq: } EXPORT_SYMBOL(ceph_osdc_notify); -/* - * Return the number of milliseconds since the watch was last - * confirmed, or an error. If there is an error, the watch is no - * longer valid, and should be destroyed with ceph_osdc_unwatch(). 
- */ -int ceph_osdc_watch_check(struct ceph_osd_client *osdc, - struct ceph_osd_linger_request *lreq) -{ - unsigned long stamp, age; - int ret; - - down_read(&osdc->lock); - mutex_lock(&lreq->lock); - stamp = lreq->watch_valid_thru; - if (!list_empty(&lreq->pending_lworks)) { - struct linger_work *lwork = - list_first_entry(&lreq->pending_lworks, - struct linger_work, - pending_item); - - if (time_before(lwork->queued_stamp, stamp)) - stamp = lwork->queued_stamp; - } - age = jiffies - stamp; - dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, - lreq, lreq->linger_id, age, lreq->last_error); - /* we are truncating to msecs, so return a safe upper bound */ - ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); - - mutex_unlock(&lreq->lock); - up_read(&osdc->lock); - return ret; -} - static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) { u8 struct_v; @@ -5106,7 +5077,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { void *p = page_address(pages[0]); @@ -5183,7 +5154,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { ret = req->r_ops[0].rval; @@ -5318,14 +5289,14 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op_reply); } -static int osd_req_op_copy_from_init(struct ceph_osd_request *req, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) +int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags) { struct ceph_osd_req_op *op; struct page **pages; @@ -5335,8 +5306,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, if (IS_ERR(pages)) return PTR_ERR(pages); - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, - dst_fadvise_flags); + op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, + dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; @@ -5354,49 +5325,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, op->indata_len, 0, false, true); return 0; } - -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) -{ - struct ceph_osd_request *req; - int ret; - - req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); - if (!req) - return -ENOMEM; - - req->r_flags = CEPH_OSD_FLAG_WRITE; - - ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); - ceph_oid_copy(&req->r_t.base_oid, dst_oid); - - ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, - src_oloc, src_fadvise_flags, - dst_fadvise_flags, truncate_seq, - truncate_size, copy_from_flags); - if (ret) - goto 
out; - - ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); - if (ret) - goto out; - - ceph_osdc_start_request(osdc, req, false); - ret = ceph_osdc_wait_request(osdc, req); - -out: - ceph_osdc_put_request(req); - return ret; -} -EXPORT_SYMBOL(ceph_osdc_copy_from); +EXPORT_SYMBOL(osd_req_op_copy_from_init); int __init ceph_osdc_setup(void) { @@ -5420,7 +5349,7 @@ void ceph_osdc_cleanup(void) /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; @@ -5448,6 +5377,24 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(msg); } +/* How much sparse data was requested? */ +static u64 sparse_data_requested(struct ceph_osd_request *req) +{ + u64 len = 0; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + int i; + + for (i = 0; i < req->r_num_ops; ++i) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + if (op->op == CEPH_OSD_OP_SPARSE_READ) + len += op->extent.length; + } + } + return len; +} + /* * Lookup and return message for incoming reply. Don't try to do * anything about a larger than preallocated data portion of the @@ -5464,6 +5411,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid = le64_to_cpu(hdr->tid); + u64 srlen; down_read(&osdc->lock); if (!osd_registered(osd)) { @@ -5496,7 +5444,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply = m; } - if (data_len > req->r_reply->data_length) { + srlen = sparse_data_requested(req); + if (!srlen && data_len > req->r_reply->data_length) { pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", __func__, osd->o_osd, req->r_tid, data_len, req->r_reply->data_length); @@ -5506,6 +5455,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m = ceph_msg_get(req->r_reply); + m->sparse_read_total = srlen; + dout("get_reply tid %lld %p\n", tid, m); out_unlock_session: @@ -5542,9 +5493,9 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return m; } -static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); @@ -5568,7 +5519,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, /* * Wrappers to refcount containing ceph_osd struct */ -static struct ceph_connection *get_osd_con(struct ceph_connection *con) +static struct ceph_connection *osd_get_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; if (get_osd(osd)) @@ -5576,7 +5527,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con) return NULL; } -static void put_osd_con(struct ceph_connection *con) +static void osd_put_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; put_osd(osd); @@ -5585,39 +5536,29 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
*/ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } -static int add_authorizer_challenge(struct ceph_connection *con, +static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; @@ -5628,16 +5569,19 @@ static int add_authorizer_challenge(struct ceph_connection *con, challenge_buf, challenge_buf_len); } -static int verify_authorizer_reply(struct ceph_connection *con) +static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int osd_invalidate_authorizer(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -5647,6 +5591,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct 
ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5671,17 +5689,229 @@ static int osd_check_message_signature(struct ceph_msg *msg) return ceph_auth_check_message_signature(auth, msg); } +static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len, + bool zero) +{ + while (len) { + struct page *page; + size_t poff, plen; + + page = ceph_msg_data_next(cursor, &poff, &plen); + if (plen > len) + plen = len; + if (zero) + zero_user_segment(page, poff, poff + plen); + len -= plen; + ceph_msg_data_advance(cursor, plen); + } +} + +static int prep_next_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + struct ceph_osd_request *req; + struct ceph_osd_req_op *op; + + spin_lock(&o->o_requests_lock); + req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid)); + if (!req) { + spin_unlock(&o->o_requests_lock); + return -EBADR; + } + + if (o->o_sparse_op_idx < 0) { + dout("%s: [%d] starting new sparse read req\n", + __func__, o->o_osd); + } else { + u64 end; + + op = &req->r_ops[o->o_sparse_op_idx]; + + WARN_ON_ONCE(op->extent.sparse_ext); + + /* hand back buffer we took earlier */ + op->extent.sparse_ext = sr->sr_extent; + sr->sr_extent = NULL; + op->extent.sparse_ext_cnt = sr->sr_count; + sr->sr_ext_len = 0; + dout("%s: [%d] completed extent array len %d cursor->resid %zd\n", + __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid); + /* Advance to end of data for this operation */ + end = ceph_sparse_ext_map_end(op); + if (end < sr->sr_req_len) + advance_cursor(cursor, sr->sr_req_len - end, false); + } + + ceph_init_sparse_read(sr); + + /* find next op in this request (if any) */ + while (++o->o_sparse_op_idx < req->r_num_ops) { + op = &req->r_ops[o->o_sparse_op_idx]; + if (op->op == CEPH_OSD_OP_SPARSE_READ) + goto found; + } + + /* reset for next sparse read request */ + spin_unlock(&o->o_requests_lock); + o->o_sparse_op_idx = -1; + return 0; +found: + sr->sr_req_off = op->extent.offset; + sr->sr_req_len = op->extent.length; + sr->sr_pos = sr->sr_req_off; + dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__, + o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len); + + /* hand off request's sparse extent map buffer */ + sr->sr_ext_len = op->extent.sparse_ext_cnt; + op->extent.sparse_ext_cnt = 0; + sr->sr_extent = op->extent.sparse_ext; + op->extent.sparse_ext = NULL; + + spin_unlock(&o->o_requests_lock); + return 1; +} + +#ifdef __BIG_ENDIAN +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ + int i; + + for (i = 0; i < sr->sr_count; i++) { + struct ceph_sparse_extent *ext = &sr->sr_extent[i]; + + ext->off = le64_to_cpu((__force 
__le64)ext->off); + ext->len = le64_to_cpu((__force __le64)ext->len); + } +} +#else +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ +} +#endif + +static int osd_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor, + char **pbuf) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + u32 count = sr->sr_count; + u64 eoff, elen, len = 0; + int i, ret; + + switch (sr->sr_state) { + case CEPH_SPARSE_READ_HDR: +next_op: + ret = prep_next_sparse_read(con, cursor); + if (ret <= 0) + return ret; + + /* number of extents */ + ret = sizeof(sr->sr_count); + *pbuf = (char *)&sr->sr_count; + sr->sr_state = CEPH_SPARSE_READ_EXTENTS; + break; + case CEPH_SPARSE_READ_EXTENTS: + /* Convert sr_count to host-endian */ + count = le32_to_cpu((__force __le32)sr->sr_count); + sr->sr_count = count; + dout("[%d] got %u extents\n", o->o_osd, count); + + if (count > 0) { + if (!sr->sr_extent || count > sr->sr_ext_len) { + /* no extent array provided, or too short */ + kfree(sr->sr_extent); + sr->sr_extent = kmalloc_array(count, + sizeof(*sr->sr_extent), + GFP_NOIO); + if (!sr->sr_extent) { + pr_err("%s: failed to allocate %u extents\n", + __func__, count); + return -ENOMEM; + } + sr->sr_ext_len = count; + } + ret = count * sizeof(*sr->sr_extent); + *pbuf = (char *)sr->sr_extent; + sr->sr_state = CEPH_SPARSE_READ_DATA_LEN; + break; + } + /* No extents? Read data len */ + fallthrough; + case CEPH_SPARSE_READ_DATA_LEN: + convert_extent_map(sr); + ret = sizeof(sr->sr_datalen); + *pbuf = (char *)&sr->sr_datalen; + sr->sr_state = CEPH_SPARSE_READ_DATA_PRE; + break; + case CEPH_SPARSE_READ_DATA_PRE: + /* Convert sr_datalen to host-endian */ + sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen); + for (i = 0; i < count; i++) + len += sr->sr_extent[i].len; + if (sr->sr_datalen != len) { + pr_warn_ratelimited("data len %u != extent len %llu\n", + sr->sr_datalen, len); + return -EREMOTEIO; + } + sr->sr_state = CEPH_SPARSE_READ_DATA; + fallthrough; + case CEPH_SPARSE_READ_DATA: + if (sr->sr_index >= count) { + sr->sr_state = CEPH_SPARSE_READ_HDR; + goto next_op; + } + + eoff = sr->sr_extent[sr->sr_index].off; + elen = sr->sr_extent[sr->sr_index].len; + + dout("[%d] ext %d off 0x%llx len 0x%llx\n", + o->o_osd, sr->sr_index, eoff, elen); + + if (elen > INT_MAX) { + dout("Sparse read extent length too long (0x%llx)\n", + elen); + return -EREMOTEIO; + } + + /* zero out anything from sr_pos to start of extent */ + if (sr->sr_pos < eoff) + advance_cursor(cursor, eoff - sr->sr_pos, true); + + /* Set position to end of extent */ + sr->sr_pos = eoff + elen; + + /* send back the new length and nullify the ptr */ + cursor->sr_resid = elen; + ret = elen; + *pbuf = NULL; + + /* Bump the array index */ + ++sr->sr_index; + break; + } + return ret; +} + static const struct ceph_connection_operations osd_con_ops = { - .get = get_osd_con, - .put = put_osd_con, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .add_authorizer_challenge = add_authorizer_challenge, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .alloc_msg = alloc_msg, + .get = osd_get_con, + .put = osd_put_con, + .sparse_read = osd_sparse_read, + .alloc_msg = osd_alloc_msg, + .dispatch = osd_dispatch, + .fault = osd_fault, .reencode_message = osd_reencode_message, + .get_authorizer = osd_get_authorizer, + .add_authorizer_challenge = osd_add_authorizer_challenge, + .verify_authorizer_reply = 
osd_verify_authorizer_reply, + .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, };
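
Note: the diff above adds CEPH_OSD_DATA_TYPE_ITER and the exported helper osd_req_op_extent_osd_iter(), which lets a caller hand an iov_iter to an extent op instead of a page vector or bvec array. A minimal caller sketch follows; the wrapper name example_setup_read_iter() and the assumption that the read op sits at index 0 are illustrative only and not part of the patch.

	#include <linux/ceph/osd_client.h>

	/*
	 * Illustrative sketch, not from the patch.  Assumes @req has at
	 * least one op slot and @iter already covers @len bytes of
	 * destination memory.
	 */
	static void example_setup_read_iter(struct ceph_osd_request *req,
					    struct iov_iter *iter,
					    u64 off, u64 len)
	{
		/* op 0: plain read of @len bytes at @off, no truncation */
		osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, off, len, 0, 0);

		/* new helper from this patch: use the iterator as the data buffer */
		osd_req_op_extent_osd_iter(req, 0, iter);
	}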
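
Note: the patch also introduces CEPH_OSD_OP_SPARSE_READ. __ceph_alloc_sparse_ext_map() preallocates room for the reply's extent map, and prep_next_sparse_read()/osd_sparse_read() hand the decoded extents back through op->extent.sparse_ext. A hedged sketch of how a caller might consume that map follows; example_sparse_read(), the extent count of 16, and the assumption that op 0 is already a sparse read (e.g. set up via osd_req_op_extent_init()) are assumptions for illustration.

	#include <linux/ceph/osd_client.h>

	/*
	 * Illustrative sketch, not from the patch.  @req is expected to
	 * already carry a CEPH_OSD_OP_SPARSE_READ op at index 0.
	 */
	static int example_sparse_read(struct ceph_osd_client *osdc,
				       struct ceph_osd_request *req)
	{
		struct ceph_osd_req_op *op = &req->r_ops[0];
		int i, ret;

		/* reserve space for up to 16 extents in the reply */
		ret = __ceph_alloc_sparse_ext_map(op, 16);
		if (ret)
			return ret;

		ceph_osdc_start_request(osdc, req);	/* returns void after this patch */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ret < 0)
			return ret;

		/* the OSD's extent map is handed back through the op on completion */
		for (i = 0; i < op->extent.sparse_ext_cnt; i++)
			pr_debug("extent %d: 0x%llx~0x%llx\n", i,
				 op->extent.sparse_ext[i].off,
				 op->extent.sparse_ext[i].len);

		return 0;
	}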
