From 9d521470a40f16110bd31018034155c60c1a1275 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 31 Jan 2014 17:54:26 +0200 Subject: libceph: a per-osdc crush scratch buffer With the addition of erasure coding support in the future, scratch variable-length array in crush_do_rule_ary() is going to grow to at least 200 bytes on average, on top of another 128 bytes consumed by rawosd/osd arrays in the call chain. Replace it with a buffer inside struct osdmap and a mutex. This shouldn't result in any contention, because all osd requests were already serialized by request_mutex at that point; the only unlocked caller was ceph_ioctl_get_dataloc(). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index aade4a5c1c07..9d1aaa24def6 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -698,7 +698,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) map = kzalloc(sizeof(*map), GFP_NOFS); if (map == NULL) return ERR_PTR(-ENOMEM); + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -1142,14 +1144,20 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_oloc_oid_to_pg); -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, - int *result, int result_max, - const __u32 *weight, int weight_max) +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) { - int scratch[result_max * 3]; + int r; + + BUG_ON(result_max > CEPH_PG_MAX_SIZE); + + mutex_lock(&map->crush_scratch_mutex); + r = crush_do_rule(map->crush, ruleno, x, result, result_max, + weight, weight_max, map->crush_scratch_ary); + mutex_unlock(&map->crush_scratch_mutex); - return crush_do_rule(map, ruleno, x, result, result_max, - weight, weight_max, scratch); + return r; } /* @@ -1205,9 +1213,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule_ary(osdmap->crush, ruleno, pps, - osds, min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); + r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num), + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit From 7b25bf5f02c5c80adf96120e031dc3a1756ce54d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:26 +0200 Subject: libceph: encode CEPH_OSD_OP_FLAG_* op flags Encode ceph_osd_op::flags field so that it gets sent over the wire. 
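A minimal userspace sketch of the host-to-wire conversion this patch completes (an illustration under assumptions: op_host/op_wire and the flag value are made-up stand-ins, not the real ceph_osd_op layout):

#include <stdint.h>
#include <stdio.h>
#include <endian.h>     /* htole16()/htole32(), glibc */

/* hypothetical stand-ins for the host-order and wire-order op structs */
struct op_host { uint16_t op; uint32_t flags; uint32_t payload_len; };
struct op_wire { uint16_t op; uint32_t flags; uint32_t payload_len; };

static void encode_op(const struct op_host *src, struct op_wire *dst)
{
        dst->op          = htole16(src->op);
        dst->flags       = htole32(src->flags); /* the conversion the patch adds */
        dst->payload_len = htole32(src->payload_len);
}

int main(void)
{
        struct op_host src = { .op = 1, .flags = 0x1 /* some CEPH_OSD_OP_FLAG_* bit */,
                               .payload_len = 42 };
        struct op_wire dst;

        encode_op(&src, &dst);
        printf("op=%#x flags=%#x payload_len=%u\n", dst.op, dst.flags, dst.payload_len);
        return 0;
}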
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0676f2b199d6..5d7fd0b8c1c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -688,7 +688,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; -- cgit From c647b8a8c6366f849c2a237bfe525cb1d316d5f4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op This is primarily for rbd's benefit and is supposed to combat fragmentation: "... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit." SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set FAILOK flag for all SETALLOCHINT ops. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5d7fd0b8c1c8..71830d79b0f4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. 
+ */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); -- cgit From d90deda69cb82411ba7d990e97218e0f8b2d07bb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 23 Mar 2014 06:50:39 +0800 Subject: libceph: fix oops in ceph_msg_data_{pages,pagelist}_advance() When there is no more data, ceph_msg_data_{pages,pagelist}_advance() should not move on to the next page. Signed-off-by: Yan, Zheng --- net/ceph/messenger.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 30efc5c18622..4f55f9ce63fa 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); @@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); -- cgit From 48a163dbb517eba13643bf404a0d695c1ab0a60d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:36 +0200 Subject: crush: fix off-by-one errors in total_tries refactor Back in 27f4d1f6bc32c2ed7b2c5080cbd58b14df622607 we refactored the CRUSH code to allow adjustment of the retry counts on a per-pool basis. That commit had an off-by-one bug: the previous "tries" counter was a *retry* count, not a *try* count, but the new code was passing in 1 meaning there should be no retries. Fix the ftotal vs tries comparison to use < instead of <= to fix the problem. Note that the original code used <= here, which means the global "choose_total_tries" tunable is actually counting retries. Compensate for that by adding 1 in crush_do_rule when we pull the tunable into the local variable. This was noticed looking at output from a user provided osdmap. Unfortunately the map doesn't illustrate the change in mapping behavior and I haven't managed to construct one yet that does. Inspection of the crush debug output now aligns with prior versions, though. Reflects ceph.git commit 795704fd615f0b008dcc81aa088a859b2d075138. 
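A runnable userspace model of the tries-versus-retries distinction described above, before the diff below (illustrative only; attempts_made() is a hypothetical helper, not CRUSH code):

#include <stdio.h>

/* with a budget of "tries" attempts, the loop must test ftotal < tries;
 * testing <= grants one extra attempt, which is why crush_do_rule() now
 * adds 1 when converting the legacy choose_total_tries retry count */
static int attempts_made(unsigned int tries, int off_by_one)
{
        unsigned int ftotal = 0;
        int attempts = 0;

        for (;;) {
                attempts++;     /* one descent attempt ... */
                ftotal++;       /* ... which we pretend failed */
                if (off_by_one ? ftotal <= tries : ftotal < tries)
                        continue;       /* retry descent */
                break;                  /* out of tries */
        }
        return attempts;
}

int main(void)
{
        printf("correct:    %d attempts\n", attempts_made(5, 0)); /* 5 */
        printf("off-by-one: %d attempts\n", attempts_made(5, 1)); /* 6 */
        return 0;
}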
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b703790b4e44..074bb2a5e675 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -292,8 +292,8 @@ static int is_out(const struct crush_map *map, * @outpos: our position in that vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make - * @local_tries: localized retries - * @local_fallback_tries: localized fallback retries + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @out2: second output vector for leaf items (if @recurse_to_leaf) */ @@ -304,8 +304,8 @@ static int crush_choose_firstn(const struct crush_map *map, int *out, int outpos, unsigned int tries, unsigned int recurse_tries, - unsigned int local_tries, - unsigned int local_fallback_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, int recurse_to_leaf, int *out2) { @@ -344,9 +344,9 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 1; goto reject; } - if (local_fallback_tries > 0 && + if (local_fallback_retries > 0 && flocal >= (in->size>>1) && - flocal > local_fallback_tries) + flocal > local_fallback_retries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -393,8 +393,8 @@ static int crush_choose_firstn(const struct crush_map *map, x, outpos+1, 0, out2, outpos, recurse_tries, 0, - local_tries, - local_fallback_tries, + local_retries, + local_fallback_retries, 0, NULL) <= outpos) /* didn't get leaf */ @@ -420,14 +420,14 @@ reject: ftotal++; flocal++; - if (collide && flocal <= local_tries) + if (collide && flocal <= local_retries) /* retry locally a few times */ retry_bucket = 1; - else if (local_fallback_tries > 0 && - flocal <= in->size + local_fallback_tries) + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= tries) + else if (ftotal < tries) /* then retry descent */ retry_descent = 1; else @@ -640,10 +640,18 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int choose_tries = map->choose_total_tries; - int choose_local_tries = map->choose_local_tries; - int choose_local_fallback_tries = map->choose_local_fallback_tries; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. 
+ */ + int choose_tries = map->choose_total_tries + 1; int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -677,12 +685,12 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: if (curstep->arg1 > 0) - choose_local_tries = curstep->arg1; + choose_local_retries = curstep->arg1; break; case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: if (curstep->arg1 > 0) - choose_local_fallback_tries = curstep->arg1; + choose_local_fallback_retries = curstep->arg1; break; case CRUSH_RULE_CHOOSELEAF_FIRSTN: @@ -734,8 +742,8 @@ int crush_do_rule(const struct crush_map *map, o+osize, j, choose_tries, recurse_tries, - choose_local_tries, - choose_local_fallback_tries, + choose_local_retries, + choose_local_fallback_retries, recurse_to_leaf, c+osize); } else { -- cgit From 6ed1002f368c63ef79d7f659fcb4368a90098132 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: allow crush rules to set (re)tries counts to 0 These two fields are misnomers; they are *retry* counts. Reflects ceph.git commit f17caba8ae0cad7b6f8f35e53e5f73b444696835. Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 074bb2a5e675..b3fb84903b30 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -684,12 +684,12 @@ int crush_do_rule(const struct crush_map *map, break; case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: - if (curstep->arg1 > 0) + if (curstep->arg1 >= 0) choose_local_retries = curstep->arg1; break; case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: - if (curstep->arg1 > 0) + if (curstep->arg1 >= 0) choose_local_fallback_retries = curstep->arg1; break; -- cgit From e2b149cc4ba00766aceb87950c6de72ea7fc8b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add chooseleaf_vary_r tunable The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. 
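A tiny userspace sketch of the sub_r derivation, mirroring the computation in the hunk that follows (sub_r_for() is a hypothetical helper added for illustration):

#include <stdio.h>

/* vary_r == 0 keeps the old behaviour (recursion always starts at r = 0);
 * vary_r == 1 passes the parent's r through; higher values damp it */
static int sub_r_for(unsigned int vary_r, int r)
{
        return vary_r ? r >> (vary_r - 1) : 0;
}

int main(void)
{
        unsigned int vary_r;

        for (vary_r = 0; vary_r <= 3; vary_r++)
                printf("vary_r=%u: r=5 -> sub_r=%d, r=6 -> sub_r=%d\n",
                       vary_r, sub_r_for(vary_r, 5), sub_r_for(vary_r, 6));
        return 0;
}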
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b3fb84903b30..947150cde297 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, @@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_retries, unsigned int local_fallback_retries, int recurse_to_leaf, - int *out2) + unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; + r = rep + parent_r; /* r' = r + f_total */ r += ftotal; @@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, @@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map, local_retries, local_fallback_retries, 0, - NULL) <= outpos) + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map, int choose_local_retries = map->choose_local_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries; + int vary_r = map->chooseleaf_vary_r; + if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); return 0; @@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map, choose_local_retries, choose_local_fallback_retries, recurse_to_leaf, - c+osize); + vary_r, + c+osize, + 0); } else { crush_choose_indep( map, -- cgit From d83ed858f144e3cfbef883d4bc499113cdddabeb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add SET_CHOOSELEAF_VARY_R step This lets you adjust the vary_r tunable on a per-rule basis. Reflects ceph.git commit f944ccc20aee60a7d8da7e405ec75ad1cd449fac. 
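A compact userspace model of how such SET_* steps override map-wide tunables per rule (illustrative; the enum and struct are stand-ins for the real CRUSH rule structures, and the >= 0 test is what lets a rule set a value of 0):

#include <stdio.h>

enum step_op { SET_CHOOSELEAF_VARY_R, CHOOSE }; /* hypothetical opcodes */

struct rule_step { enum step_op op; int arg1; };

int main(void)
{
        int vary_r = 0;         /* map-wide default */
        struct rule_step steps[] = { { SET_CHOOSELEAF_VARY_R, 1 }, { CHOOSE, 0 } };
        unsigned int i;

        for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
                switch (steps[i].op) {
                case SET_CHOOSELEAF_VARY_R:
                        if (steps[i].arg1 >= 0) /* >= 0, so 0 is settable */
                                vary_r = steps[i].arg1;
                        break;
                case CHOOSE:
                        printf("choose step runs with vary_r=%d\n", vary_r);
                        break;
                }
        }
        return 0;
}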
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 947150cde297..a1ef53c04415 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -709,6 +709,11 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; -- cgit From 35fea3a18a1df8f981f292b492c2de3a9e4e5fc2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:12 +0200 Subject: libceph: refer to osdmap directly in osdmap_show() To make it more readable and save screen space. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 258a382e75ed..d225842c7b41 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -53,34 +53,36 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct ceph_osdmap *map = client->osdc.osdmap; struct rb_node *n; - if (client->osdc.osdmap == NULL) + if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + + seq_printf(s, "epoch %d\n", map->epoch); seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", + (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %llu pg_num %d / %d\n", (unsigned long long)pool->id, pool->pg_num, pool->pg_num_mask); } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; + for (i = 0; i < map->max_osd; i++) { + struct ceph_entity_addr *addr = &map->osd_addr[i]; + int state = map->osd_state[i]; char sb[64]; seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); } + return 0; } -- cgit From 0a2800d7280ccdf3194bd8bd74a2eb8c315b54c6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:12 +0200 Subject: libceph: do not prefix osd lines with \t in debugfs output To save screen space in anticipation of more fields (e.g. primary affinity). 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d225842c7b41..112d98edb156 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p) int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); -- cgit From 1c00240e007d14d3242fa490b50166b4f1b2770a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: dump pg_temp mappings to debugfs Dump pg_temp mappings to /sys/kernel/debug/ceph/<client>/osdmap, one 'pg_temp <pgid> [<osd>, ..., <osd>]' per line, e.g: pg_temp 2.6 [2,3,4] Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 112d98edb156..c45d235e774e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -82,6 +82,17 @@ static int osdmap_show(struct seq_file *s, void *p) ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); } + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->osds[i]); + seq_printf(s, "]\n"); + } return 0; } -- cgit From 38a8d560231b45489d5b12f5c7d0edfba94e1f30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: dump osdmap and enhance output on decode errors Dump osdmap in hex on both full and incremental decode errors, to make it easier to match the contents with the error offset. dout() map epoch and max_osd value on success.
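A userspace stand-in for the hex dump this patch adds on decode errors (illustrative; the kernel side uses print_hex_dump(), not this helper):

#include <stdio.h>
#include <stddef.h>

/* dump the buffer that failed to decode, 16 bytes per row, so the
 * reported error offset can be matched against the contents by eye */
static void hex_dump(const unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                printf("%02x%c", buf[i], (i % 16 == 15) ? '\n' : ' ');
        if (len % 16)
                putchar('\n');
}

int main(void)
{
        unsigned char osdmap[] = { 0x07, 0x01, 0x2a, 0x00, 0xde, 0xad };

        hex_dump(osdmap, sizeof(osdmap));
        return 0;
}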
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9d1aaa24def6..4dd000d128fd 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -690,6 +690,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) u16 version; u32 len, max, i; int err = -EINVAL; + u32 epoch = 0; void *start = *p; struct ceph_pg_pool_info *pi; @@ -714,7 +715,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - map->epoch = ceph_decode_32(p); + epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); @@ -814,14 +815,18 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; } - /* ignore the rest of the map */ + /* ignore the rest */ *p = end; - dout("osdmap_decode done %p %p\n", *p, end); + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; bad: - dout("osdmap_decode fail err %d\n", err); + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); ceph_osdmap_destroy(map); return ERR_PTR(err); } @@ -845,6 +850,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, int err = -EINVAL; u16 version; + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + ceph_decode_16_safe(p, end, version, bad); if (version != 6) { pr_warning("got unknown v %d != 6 of inc osdmap\n", version); @@ -1032,11 +1039,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* ignore the rest */ *p = end; + + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; bad: - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", - epoch, (int)(*p - start), *p, start, end); + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); -- cgit From a2505d63ee0541d9b4685250b033192e68222e97 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: split osdmap allocation and decode steps Split osdmap allocation and initialization into a separate function, ceph_osdmap_decode(). 
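The split follows a common alloc/decode pattern; a minimal userspace sketch of it (the names here are illustrative, not the kernel functions):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct map { int epoch; };

/* bare decoder: fills a caller-provided map, returns 0 or -errno */
static int map_decode(const char **p, const char *end, struct map *m)
{
        if (end - *p < 1)
                return -EINVAL;
        m->epoch = *(*p)++;
        return 0;
}

/* wrapper owns allocation and cleanup, like ceph_osdmap_decode() */
static struct map *map_alloc_decode(const char **p, const char *end)
{
        struct map *m = calloc(1, sizeof(*m));
        int ret;

        if (!m)
                return NULL;            /* kernel: ERR_PTR(-ENOMEM) */

        ret = map_decode(p, end, m);
        if (ret) {
                free(m);                /* kernel: ceph_osdmap_destroy() */
                return NULL;            /* kernel: ERR_PTR(ret) */
        }
        return m;
}

int main(void)
{
        const char buf[] = { 42 };
        const char *p = buf;
        struct map *m = map_alloc_decode(&p, buf + sizeof(buf));

        if (m) {
                printf("epoch %d\n", m->epoch);
                free(m);
        }
        return 0;
}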
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 +- net/ceph/osdmap.c | 44 +++++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 71830d79b0f4..6f64eec18851 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2062,7 +2062,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4dd000d128fd..a82df6ea0749 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -684,9 +684,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * decode a full map. */ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - struct ceph_osdmap *map; u16 version; u32 len, max, i; int err = -EINVAL; @@ -694,14 +693,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) void *start = *p; struct ceph_pg_pool_info *pi; - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - - map->pg_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -751,7 +743,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = osdmap_set_max_osd(map, max); if (err < 0) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); /* osds */ err = -EINVAL; @@ -819,7 +810,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); - return map; + return 0; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", @@ -827,8 +818,31 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -872,7 +886,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ -- cgit From 597b52f6ca247086371abd67e5083292a500e736 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: fixup error handling in osdmap_decode() The existing error handling scheme requires resetting err to -EINVAL prior to calling any ceph_decode_* macro. This is ugly and fragile, and there already are a few places where we would return 0 on error, due to a missing reset. Fix this by adding a special e_inval label to be used by all ceph_decode_* macros. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a82df6ea0749..298d076eee89 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -688,36 +688,37 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { u16 version; u32 len, max, i; - int err = -EINVAL; u32 epoch = 0; void *start = *p; + int err; struct ceph_pg_pool_info *pi; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); + ceph_decode_16_safe(p, end, version, e_inval); if (version > 6) { pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto bad; + goto e_inval; } if (version < 6) { pr_warning("got old v %d < 6 of osdmap\n", version); - goto bad; + goto e_inval; } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, bad); + ceph_decode_32_safe(p, end, max, e_inval); while (max--) { - ceph_decode_need(p, end, 8 + 2, bad); - err = -ENOMEM; + ceph_decode_need(p, end, 8 + 2, e_inval); pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) + if (!pi) { + err = -ENOMEM; goto bad; + } pi->id = ceph_decode_64(p); err = __decode_pool(p, end, pi); if (err < 0) { @@ -728,27 +729,25 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) } err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); + if (err) goto bad; - } - ceph_decode_32_safe(p, end, map->pool_max, bad); + ceph_decode_32_safe(p, end, map->pool_max, e_inval); - ceph_decode_32_safe(p, end, map->flags, bad); + ceph_decode_32_safe(p, end, map->flags, e_inval); max = ceph_decode_32(p); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; /* osds */ - err = -EINVAL; ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + - sizeof(*map->osd_addr)), bad); + sizeof(*map->osd_addr)), e_inval); + *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_state, map->max_osd); @@ -762,7 +761,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); for (i = 0; i < len; i++) { int n, j; struct ceph_pg pgid; @@ -771,16 +770,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u32), e_inval); n = ceph_decode_32(p); - err = -EINVAL; if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - ceph_decode_need(p, end, n * sizeof(u32), bad); - err = -ENOMEM; + goto e_inval; + ceph_decode_need(p, end, n * sizeof(u32), e_inval); pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) + if (!pg) { + err = -ENOMEM; goto bad; + } pg->pgid = pgid; pg->len = n; for (j = 0; j < n; j++) @@ -794,10 +793,10 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) } /* crush */ - ceph_decode_32_safe(p, end, len, 
bad); + ceph_decode_32_safe(p, end, len, e_inval); dout("osdmap_decode crush len %d from off 0x%x\n", len, (int)(*p - start)); - ceph_decode_need(p, end, len, bad); + ceph_decode_need(p, end, len, e_inval); map->crush = crush_decode(*p, end); *p += len; if (IS_ERR(map->crush)) { @@ -812,6 +811,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return 0; +e_inval: + err = -EINVAL; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); -- cgit From 3977058c468b872c6bc5e5273bf911d791848643 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: safely decode max_osd value in osdmap_decode() max_osd value is not covered by any ceph_decode_need(). Use a safe version of ceph_decode_* macro to decode it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 298d076eee89..ec06010657b3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -687,9 +687,10 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { u16 version; - u32 len, max, i; u32 epoch = 0; void *start = *p; + u32 max; + u32 len, i; int err; struct ceph_pg_pool_info *pi; @@ -736,7 +737,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, map->flags, e_inval); - max = ceph_decode_32(p); + /* max_osd */ + ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); -- cgit From 2d88b2e0819e0401ebb195e9fa20fab4be1965c8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: check length of osdmap osd arrays Check length of osd_state, osd_weight and osd_addr arrays. They should all have exactly max_osd elements after the call to osdmap_set_max_osd(). 
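A small userspace model of the added validation (check_len() is a hypothetical helper; the count is read host-endian for brevity):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* each osd array is preceded by a u32 count that must equal max_osd;
 * the old code skipped these 4 bytes, the patch validates the value */
static int check_len(const uint8_t **p, uint32_t max_osd)
{
        uint32_t len;

        memcpy(&len, *p, sizeof(len));
        *p += sizeof(len);
        return len == max_osd ? 0 : -1;
}

int main(void)
{
        const uint8_t buf[] = { 2, 0, 0, 0, 0x01, 0x05 }; /* count, 2 states */
        const uint8_t *p = buf;

        if (check_len(&p, 2))
                return 1;
        printf("osd_state: %#x %#x\n", p[0], p[1]);
        return 0;
}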
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ec06010657b3..c39ac624ccc3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -745,19 +745,25 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; - /* osds */ + /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + sizeof(*map->osd_addr)), e_inval); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_state, map->max_osd); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); for (i = 0; i < map->max_osd; i++) ceph_decode_addr(&map->osd_addr[i]); -- cgit From 9902e682c7f3df9ed5f60bc6f9c7efa6fd6b2d1d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:15 +0200 Subject: libceph: fix crush_decode() call site in osdmap_decode() The size of the memory area fed to crush_decode() should be limited not only by the osdmap end, but also by the crush map length. Also, drop the unnecessary dout() (the dout() in crush_decode() conveys the same info) and step past the crush map only if it is decoded successfully. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c39ac624ccc3..d4a6b0df3627 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -802,16 +802,13 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* crush */ ceph_decode_32_safe(p, end, len, e_inval); - dout("osdmap_decode crush len %d from off 0x%x\n", len, - (int)(*p - start)); - ceph_decode_need(p, end, len, e_inval); - map->crush = crush_decode(*p, end); - *p += len; + map->crush = crush_decode(*p, min(*p + len, end)); if (IS_ERR(map->crush)) { err = PTR_ERR(map->crush); map->crush = NULL; goto bad; } + *p += len; /* ignore the rest */ *p = end; -- cgit From 86f1742b94dd0b4a2eb9255205d1756ddea182f8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:15 +0200 Subject: libceph: fixup error handling in osdmap_apply_incremental() The existing error handling scheme requires resetting err to -EINVAL prior to calling any ceph_decode_* macro. This is ugly and fragile, and there already are a few places where we would return 0 on error, due to a missing reset. Follow osdmap_decode() and fix this by adding a special e_inval label to be used by all ceph_decode_* macros.
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 66 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d4a6b0df3627..b844a9273666 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -867,19 +867,19 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s64 new_pool_max; __s32 new_flags, max; void *start = *p; - int err = -EINVAL; + int err; u16 version; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); + ceph_decode_16_safe(p, end, version, e_inval); if (version != 6) { pr_warning("got unknown v %d != 6 of inc osdmap\n", version); - goto bad; + goto e_inval; } ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - bad); + e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -888,7 +888,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, new_flags = ceph_decode_32(p); /* full map? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); @@ -896,13 +896,14 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new crush? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - dout("apply_incremental new crush map len %d, %p to %p\n", - len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) - return ERR_CAST(newcrush); + if (IS_ERR(newcrush)) { + err = PTR_ERR(newcrush); + newcrush = NULL; + goto bad; + } *p += len; } @@ -912,13 +913,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), bad); + ceph_decode_need(p, end, 5*sizeof(u32), e_inval); /* new max? 
*/ max = ceph_decode_32(p); if (max >= 0) { err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; } @@ -932,11 +933,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); @@ -953,29 +954,28 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } if (version >= 5) { err = __decode_pool_names(p, end, map); - if (err < 0) + if (err) goto bad; } /* old_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ - err = -EINVAL; - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, bad); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); @@ -984,11 +984,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_state */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; u8 xorstate; - ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_32_safe(p, end, osd, e_inval); xorstate = **(u8 **)p; (*p)++; /* clean flag */ if (xorstate == 0) @@ -1000,10 +1000,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_weight */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); osd = ceph_decode_32(p); off = ceph_decode_32(p); pr_info("osd%d weight 0x%x %s\n", osd, off, @@ -1014,7 +1014,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_mapping *pg; int j; @@ -1024,22 +1024,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u32), e_inval); pglen = ceph_decode_32(p); if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), bad); + ceph_decode_need(p, end, pglen*sizeof(u32), e_inval); /* removing existing (if any) */ (void) __remove_pg_mapping(&map->pg_temp, pgid); /* insert */ - err = -EINVAL; if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - err = -ENOMEM; + goto e_inval; pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) + if (!pg) { + err = -ENOMEM; goto bad; + } pg->pgid = pgid; pg->len = pglen; for (j = 0; j < pglen; j++) @@ -1063,6 +1063,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; +e_inval: + err = -EINVAL; bad: pr_err("corrupt inc 
osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); -- cgit From 9464d00862ea6a5c0edc7118c86bdfa71a95190e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:16 +0200 Subject: libceph: nuke bogus encoding version check in osdmap_apply_incremental() Only version 6 of osdmap encoding is supported, anything other than version 6 results in an error and halts the decoding process. Checking if version is >= 5 is therefore bogus. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b844a9273666..07fa3697ea12 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -952,11 +952,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err < 0) goto bad; } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err) - goto bad; - } + + err = __decode_pool_names(p, end, map); + if (err) + goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, e_inval); -- cgit From 53bbaba9d811f75acc20038bfd707a4cfdc4571f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:16 +0200 Subject: libceph: fix and clarify ceph_decode_need() sizes Sum up sizeof(...) results instead of (incorrectly) hard-coding the number of bytes, expressed in ints and longs. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 07fa3697ea12..11649101a4a8 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -706,7 +706,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), e_inval); + /* fsid, epoch, created, modified */ + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); @@ -878,8 +880,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto e_inval; } - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - e_inval); + /* fsid, epoch, modified, new_pool_max, new_flags */ + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -913,10 +916,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), e_inval); - /* new max? */ - max = ceph_decode_32(p); + ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); if (err) -- cgit From 0f70c7eedbbbd245398fc74b4b020f3b800f071c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:27 +0200 Subject: libceph: rename __decode_pool{,_names}() to decode_pool{,_names}() To be in line with all the other osdmap decode helpers. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 11649101a4a8..39938d79756c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; @@ -587,7 +587,7 @@ bad: return -EINVAL; } -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; @@ -723,7 +723,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto bad; } pi->id = ceph_decode_64(p); - err = __decode_pool(p, end, pi); + err = decode_pool(p, end, pi); if (err < 0) { kfree(pi); goto bad; @@ -731,7 +731,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) __insert_pg_pool(&map->pg_pools, pi); } - err = __decode_pool_names(p, end, map); + /* pool_name */ + err = decode_pool_names(p, end, map); if (err) goto bad; @@ -949,12 +949,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pi->id = pool; __insert_pg_pool(&map->pg_pools, pi); } - err = __decode_pool(p, end, pi); + err = decode_pool(p, end, pi); if (err < 0) goto bad; } - err = __decode_pool_names(p, end, map); + /* new_pool_names */ + err = decode_pool_names(p, end, map); if (err) goto bad; -- cgit From 433fbdd31db267564bab20420bd8f161a7c69e4d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:27 +0200 Subject: libceph: introduce decode{,_new}_pools() and switch to them Consolidate pools (full map, map<u64,pg_pool_t>) and new_pools (inc map, same) decoding logic into a common helper and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 94 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 37 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 39938d79756c..0ba3062d3317 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -681,6 +681,55 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg_pool_info *pi; + u64 pool; + int ret; + + ceph_decode_64_safe(p, end, pool, e_inval); + + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!incremental || !pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + return -ENOMEM; + + pi->id = pool; + + ret = __insert_pg_pool(&map->pg_pools, pi); + if (ret) { + kfree(pi); + return ret; + } + } + + ret = decode_pool(p, end, pi); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, true); +} + /* * decode a full map. 
*/ @@ -692,7 +741,6 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) u32 max; u32 len, i; int err; - struct ceph_pg_pool_info *pi; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); @@ -714,22 +762,10 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, e_inval); - while (max--) { - ceph_decode_need(p, end, 8 + 2, e_inval); - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = ceph_decode_64(p); - err = decode_pool(p, end, pi); - if (err < 0) { - kfree(pi); - goto bad; - } - __insert_pg_pool(&map->pg_pools, pi); - } + /* pools */ + err = decode_pools(p, end, map); + if (err) + goto bad; /* pool_name */ err = decode_pool_names(p, end, map); @@ -934,26 +970,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = NULL; } - /* new_pool */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - struct ceph_pg_pool_info *pi; - - ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (!pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = pool; - __insert_pg_pool(&map->pg_pools, pi); - } - err = decode_pool(p, end, pi); - if (err < 0) - goto bad; - } + /* new_pools */ + err = decode_new_pools(p, end, map); + if (err) + goto bad; /* new_pool_names */ err = decode_pool_names(p, end, map); if (err) goto bad; -- cgit From 4d60351f9089ef0f39d73c0b6a103e61fc0ed187 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:28 +0200 Subject: libceph: switch osdmap_set_max_osd() to krealloc() Use krealloc() instead of rolling our own. (krealloc() with a NULL first argument acts as a kmalloc()). Properly initialize the new array elements. This is needed to make future additions to osdmap easier. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0ba3062d3317..a350286fd826 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -646,38 +646,40 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) } /* - * adjust max osd value. reallocate arrays. + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { u8 *state; - struct ceph_entity_addr *addr; u32 *weight; + struct ceph_entity_addr *addr; + int i; - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); + if (!state || !weight || !addr) { kfree(state); - kfree(addr); kfree(weight); + kfree(addr); + return -ENOMEM; } - /* copy old? 
*/ if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); + for (i = map->max_osd; i < max; i++) { + state[i] = 0; + weight[i] = CEPH_OSD_OUT; + memset(addr + i, 0, sizeof(*addr)); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; + map->max_osd = max; + return 0; } -- cgit From 10db634e2083a202a26123d6d4c9ede98d6a1199 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:28 +0200 Subject: libceph: introduce decode{,_new}_pg_temp() and switch to them Consolidate pg_temp (full map, map<pg_t,vector<u32>>) and new_pg_temp (inc map, same) decoding logic into a common helper and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 139 ++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 72 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a350286fd826..6497322d2e3c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -732,6 +732,67 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) return __decode_pools(p, end, map, true); } +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 len, i; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, len, e_inval); + + ret = __remove_pg_mapping(&map->pg_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || len > 0) { + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, len*sizeof(u32), e_inval); + + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + return -EINVAL; + + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->len = len; + for (i = 0; i < len; i++) + pg->osds[i] = ceph_decode_32(p); + + ret = __insert_pg_mapping(pg, &map->pg_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, true); +} + /* * decode a full map. 
*/ @@ -810,36 +871,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, e_inval); - for (i = 0; i < len; i++) { - int n, j; - struct ceph_pg pgid; - struct ceph_pg_mapping *pg; - - err = ceph_decode_pgid(p, end, &pgid); - if (err) - goto bad; - ceph_decode_need(p, end, sizeof(u32), e_inval); - n = ceph_decode_32(p); - if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto e_inval; - ceph_decode_need(p, end, n * sizeof(u32), e_inval); - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) { - err = -ENOMEM; - goto bad; - } - pg->pgid = pgid; - pg->len = n; - for (j = 0; j < n; j++) - pg->osds[j] = ceph_decode_32(p); - - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) - goto bad; - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, - len); - } + err = decode_pg_temp(p, end, map); + if (err) + goto bad; /* crush */ ceph_decode_32_safe(p, end, len, e_inval); @@ -1038,48 +1072,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - struct ceph_pg_mapping *pg; - int j; - struct ceph_pg pgid; - u32 pglen; - - err = ceph_decode_pgid(p, end, &pgid); - if (err) - goto bad; - ceph_decode_need(p, end, sizeof(u32), e_inval); - pglen = ceph_decode_32(p); - if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), e_inval); - - /* removing existing (if any) */ - (void) __remove_pg_mapping(&map->pg_temp, pgid); - - /* insert */ - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto e_inval; - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) { - err = -ENOMEM; - goto bad; - } - pg->pgid = pgid; - pg->len = pglen; - for (j = 0; j < pglen; j++) - pg->osds[j] = ceph_decode_32(p); - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) { - kfree(pg); - goto bad; - } - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, - pgid.seed, pglen); - } else { - /* remove */ - __remove_pg_mapping(&map->pg_temp, pgid); - } - } + err = decode_new_pg_temp(p, end, map); + if (err) + goto bad; /* ignore the rest */ *p = end; -- cgit From ec7af97258396161e6effba7e788c3fc3cb55263 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: introduce get_osdmap_client_data_v() Full and incremental osdmaps are structured identically and have identical headers. Add a helper to decode both "old" (16-bit version, v6) and "new" (8-bit struct_v+struct_compat+struct_len, v7) osdmap encoding headers and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 16 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6497322d2e3c..be2a65fbd902 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -683,6 +683,63 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +#define OSDMAP_WRAPPER_COMPAT_VER 7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 + +/* + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. 
+ */ +static int get_osdmap_client_data_v(void **p, void *end, + const char *prefix, u8 *v) +{ + u8 struct_v; + + ceph_decode_8_safe(p, end, struct_v, e_inval); + if (struct_v >= 7) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", + struct_v, struct_compat, + OSDMAP_WRAPPER_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore wrapper struct_len */ + + ceph_decode_8_safe(p, end, struct_v, e_inval); + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", + struct_v, struct_compat, + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore client data struct_len */ + } else { + u16 version; + + *p -= 1; + ceph_decode_16_safe(p, end, version, e_inval); + if (version < 6) { + pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, + prefix); + return -EINVAL; + } + + /* old osdmap encoding */ + struct_v = 0; + } + + *v = struct_v; + return 0; + +e_inval: + return -EINVAL; +} + static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, bool incremental) { @@ -798,7 +855,7 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) */ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - u16 version; + u8 struct_v; u32 epoch = 0; void *start = *p; u32 max; @@ -807,15 +864,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, e_inval); - if (version > 6) { - pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto e_inval; - } - if (version < 6) { - pr_warning("got old v %d < 6 of osdmap\n", version); - goto e_inval; - } + err = get_osdmap_client_data_v(p, end, "full", &struct_v); + if (err) + goto bad; /* fsid, epoch, created, modified */ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + @@ -943,15 +994,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s32 new_flags, max; void *start = *p; int err; - u16 version; + u8 struct_v; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, e_inval); - if (version != 6) { - pr_warning("got unknown v %d != 6 of inc osdmap\n", version); - goto e_inval; - } + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); + if (err) + goto bad; /* fsid, epoch, modified, new_pool_max, new_flags */ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + -- cgit From 35a935d75d51abe58d3427a8b4ae3745a5a14e1c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: generalize ceph_pg_mapping In preparation for adding support for primary_temp mappings, generalize struct ceph_pg_mapping so it can hold mappings other than pg_temp.
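The include/linux/ceph/osdmap.h half of this change is not part of this net/ceph-only excerpt. Roughly, the flat len/osds fields of struct ceph_pg_mapping move into a named pg_temp member inside an anonymous union, along these lines (a sketch of the header change, not the verbatim diff):

    struct ceph_pg_mapping {
            struct rb_node node;
            struct ceph_pg pgid;

            union {
                    struct {
                            int len;
                            int osds[];     /* flexible array, sized at kmalloc time */
                    } pg_temp;
                    /* a primary_temp member is added by a later patch */
            };
    };

This is why the accesses below change from pg->len and pg->osds to pg->pg_temp.len and pg->pg_temp.osds.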
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 4 ++-- net/ceph/osdmap.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c45d235e774e..5865f2c9580a 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -88,9 +88,9 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, pg->pgid.seed); - for (i = 0; i < pg->len; i++) + for (i = 0; i < pg->pg_temp.len; i++) seq_printf(s, "%s%d", (i == 0 ? "" : ","), - pg->osds[i]); + pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index be2a65fbd902..c67a309fdfc2 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -822,9 +822,9 @@ static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, return -ENOMEM; pg->pgid = pgid; - pg->len = len; + pg->pg_temp.len = len; for (i = 0; i < len; i++) - pg->osds[i] = ceph_decode_32(p); + pg->pg_temp.osds[i] = ceph_decode_32(p); ret = __insert_pg_mapping(pg, &map->pg_temp); if (ret) { @@ -1281,8 +1281,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pg_num_mask); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - *num = pg->len; - return pg->osds; + *num = pg->pg_temp.len; + return pg->pg_temp.osds; } /* crush */ -- cgit From 9686f94c8cfc06e8afb7b2233ab8f1f6ac01957f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: primary_temp infrastructure Add primary_temp mappings infrastructure. struct ceph_pg_mapping is overloaded; primary_temp mappings are stored in an rb-tree, rooted at ceph_osdmap, in a manner similar to pg_temp mappings. Dump primary_temp mappings to /sys/kernel/debug/ceph/<client>/osdmap, one 'primary_temp <pgid> <osd>' per line, e.g.: primary_temp 2.6 4 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 7 +++++++ net/ceph/osdmap.c | 10 +++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 5865f2c9580a..612bf55e6a8b 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -93,6 +93,13 @@ static int osdmap_show(struct seq_file *s, void *p) pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } return 0; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c67a309fdfc2..c0fc517ab321 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -343,7 +343,7 @@ bad: /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -966,6 +973,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); map->pg_temp = RB_ROOT; +
map->primary_temp = RB_ROOT; mutex_init(&map->crush_scratch_mutex); ret = osdmap_decode(p, end, map); -- cgit From d286de796aab9e306e674c6d23c4f3c1f55e394c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_temp decode bits Add a common helper to decode both primary_temp (full map, map<pg_t, u32>) and new_primary_temp (inc map, same) and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c0fc517ab321..c2d793d3d98d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -857,6 +857,61 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) return __decode_pg_temp(p, end, map, true); } +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 osd; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + + ret = __remove_pg_mapping(&map->primary_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || osd != (u32)-1) { + struct ceph_pg_mapping *pg; + + pg = kzalloc(sizeof(*pg), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->primary_temp.osd = osd; + + ret = __insert_pg_mapping(pg, &map->primary_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, + struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, true); +} + /* * decode a full map. */ @@ -933,6 +988,13 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; + /* primary_temp */ + if (struct_v >= 1) { + err = decode_primary_temp(p, end, map); + if (err) + goto bad; + } + /* crush */ ceph_decode_32_safe(p, end, len, e_inval); map->crush = crush_decode(*p, min(*p + len, end)); @@ -1133,6 +1195,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err) goto bad; + /* new_primary_temp */ + if (struct_v >= 1) { + err = decode_new_primary_temp(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; -- cgit From 2cfa34f2d67a36e292cbe6e4c1e60d212b7ba4d1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_affinity infrastructure Add primary_affinity infrastructure. primary_affinity values are stored in a max_osd-sized array, hanging off ceph_osdmap, similar to the osd_weight array. Introduce {get,set}_primary_affinity() helpers, primarily to return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY when no affinity has been set and to abstract out osd_primary_affinity array allocation and initialization.
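For context: affinity values use the same 16.16 fixed-point convention as osd weights, assuming the rados.h definition CEPH_OSD_DEFAULT_PRIMARY_AFFINITY == 0x10000 (i.e. 1.0). A minimal sketch of the conversion the debugfs hunk below performs:

    /* 0x10000 -> 100%, 0x8000 -> 50%, 0 -> 0% (assumes 16.16 fixed point) */
    static inline int primary_affinity_to_percent(u32 aff)
    {
            return (aff * 100) >> 16;
    }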
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 5 +++-- net/ceph/osdmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 612bf55e6a8b..34453a2b4b4d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,10 +77,11 @@ static int osdmap_show(struct seq_file *s, void *p) int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", i, ceph_pr_addr(&addr->in_addr), ((map->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); } for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c2d793d3d98d..0ac129388e07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -649,6 +649,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_state); kfree(map->osd_weight); kfree(map->osd_addr); + kfree(map->osd_primary_affinity); kfree(map); } @@ -685,6 +686,20 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) map->osd_weight = weight; map->osd_addr = addr; + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + + for (i = map->max_osd; i < max; i++) + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + map->osd_primary_affinity = affinity; + } + map->max_osd = max; return 0; @@ -912,6 +927,38 @@ static int decode_new_primary_temp(void **p, void *end, return __decode_primary_temp(p, end, map, true); } +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + /* * decode a full map. */ -- cgit From 63a6993f521b2629872e89c02a336fb3f18b092b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_affinity decode bits Add two helpers to decode primary_affinity (full map, vector<u32>) and new_primary_affinity (inc map, map<u32, u32>) and switch to them.
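Schematically, the two encodings look like this on the wire (layout inferred from the decode helpers below, not quoted from the ceph encoding spec; all fields little-endian):

    /* primary_affinity, full map: vector<u32>, len == max_osd or 0 */
    u32 len;
    u32 aff[len];

    /* new_primary_affinity, inc map: map<u32, u32>, changed osds only */
    u32 n;
    struct { u32 osd; u32 aff; } entries[n];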
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0ac129388e07..9568e6221852 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -959,6 +959,60 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) return 0; } +static int decode_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0) { + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + return 0; + } + if (len != map->max_osd) + goto e_inval; + + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + + for (i = 0; i < map->max_osd; i++) { + int ret; + + ret = set_primary_affinity(map, i, ceph_decode_32(p)); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + u32 osd, aff; + int ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_32_safe(p, end, aff, e_inval); + + ret = set_primary_affinity(map, osd, aff); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + /* * decode a full map. */ @@ -1042,6 +1096,17 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto bad; } + /* primary_affinity */ + if (struct_v >= 2) { + err = decode_primary_affinity(p, end, map); + if (err) + goto bad; + } else { + /* XXX can this happen? */ + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + } + /* crush */ ceph_decode_32_safe(p, end, len, e_inval); map->crush = crush_decode(*p, min(*p + len, end)); @@ -1249,6 +1314,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto bad; } + /* new_primary_affinity */ + if (struct_v >= 2) { + err = decode_new_primary_affinity(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; -- cgit From 2bd93d4d7ec2dd461cfb87c6d8a9b1ef9b30de08 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:47 +0200 Subject: libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers pg_to_raw_osds() helper for computing a raw (crush) set, which can contain non-existent and down osds. raw_to_up_osds() helper for pruning non-existent and down osds from the raw set, thereby transforming it into an up set, and determining up primary. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9568e6221852..b8bbef058019 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1520,6 +1520,82 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, return osds; } +/* + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error.
+ */ +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + struct ceph_pg pgid, u32 pps, int *osds) +{ + int ruleno; + int len; + + /* crush */ + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); + if (ruleno < 0) { + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); + return -ENOENT; + } + + len = do_crush(osdmap, ruleno, pps, osds, + min_t(int, pool->size, CEPH_PG_MAX_SIZE), + osdmap->osd_weight, osdmap->max_osd); + if (len < 0) { + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", + len, ruleno, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); + return len; + } + + return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length. *primary is set to up primary osd id, or -1 + * if up set is empty. + */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int up_primary = -1; + int i; + + if (ceph_can_shift_osds(pool)) { + int removed = 0; + + for (i = 0; i < len; i++) { + if (ceph_osd_is_down(osdmap, osds[i])) { + removed++; + continue; + } + if (removed) + osds[i - removed] = osds[i]; + } + + len -= removed; + if (len > 0) + up_primary = osds[0]; + } else { + for (i = len - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, osds[i])) + osds[i] = CRUSH_ITEM_NONE; + else + up_primary = osds[i]; + } + } + + *primary = up_primary; + return len; +} + /* * Return acting set for given pgid. */ -- cgit From 45966c3467e8291382a8970adbd78403a7818d45 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:47 +0200 Subject: libceph: introduce apply_temps() helper apply_temps() helper for applying various temporary mappings (at this point only pg_temp mappings) to the up set, thereby transforming it into an acting set. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8bbef058019..bd40f56b53ed 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1596,6 +1596,58 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, return len; } +/* + * Given up set, apply pg_temp mapping. + * + * Return acting set length. *primary is set to acting primary osd id, + * or -1 if acting set is empty. + */ +static int apply_temps(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, + int *osds, int len, int *primary) +{ + struct ceph_pg_mapping *pg; + int temp_len; + int temp_primary; + int i; + + /* raw_pg -> pg */ + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, + pool->pg_num_mask); + + /* pg_temp? */ + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + temp_len = 0; + temp_primary = -1; + + for (i = 0; i < pg->pg_temp.len; i++) { + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { + if (ceph_can_shift_osds(pool)) + continue; + else + osds[temp_len++] = CRUSH_ITEM_NONE; + } else { + osds[temp_len++] = pg->pg_temp.osds[i]; + } + } + + /* apply pg_temp's primary */ + for (i = 0; i < temp_len; i++) { + if (osds[i] != CRUSH_ITEM_NONE) { + temp_primary = osds[i]; + break; + } + } + } else { + temp_len = len; + temp_primary = *primary; + } + + *primary = temp_primary; + return temp_len; +} + /* * Return acting set for given pgid.
*/ -- cgit From ac972230e20581b044f5ce66dcaf3c5af8d57444 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: switch ceph_calc_pg_acting() to new helpers Switch ceph_calc_pg_acting() to new helpers: pg_to_raw_osds(), raw_to_up_osds() and apply_temps(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index bd40f56b53ed..f1cad21d1533 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1649,24 +1649,49 @@ static int apply_temps(struct ceph_osdmap *osdmap, } /* - * Return acting set for given pgid. + * Calculate acting set for given pgid. + * + * Return acting set length, or error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) + int *osds) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) + return 0; - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) + return len; + + len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + + len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + + return len; } /* -- cgit From 8008ab1080c1768b02d232dcfd9e161cd47cc9f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: return primary from ceph_calc_pg_acting() In preparation for adding support for primary_temp, stop assuming primaryness: add a primary out parameter to ceph_calc_pg_acting() and change call sites accordingly. Primary is now specified separately from the order of osds in the set. 
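A minimal sketch of the new calling convention (hypothetical caller, mirroring the __map_request() change below):

    int osds[CEPH_PG_MAX_SIZE];
    int primary;
    int len;

    len = ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
    if (len < 0)
            len = 0;        /* treat errors as "no acting set" */
    /* primary is -1 if the acting set is empty or an error occurred */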
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 10 ++++------ net/ceph/osdmap.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f64eec18851..b4157dc22199 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1333,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; @@ -1346,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f1cad21d1533..df9389ddd56c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1651,19 +1651,21 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* * Calculate acting set for given pgid. * - * Return acting set length, or error. + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds) + int *osds, int *primary) { struct ceph_pg_pool_info *pool; u32 pps; int len; - int primary; pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return 0; + if (!pool) { + *primary = -1; + return -ENOENT; + } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ @@ -1684,12 +1686,14 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) + if (len < 0) { + *primary = -1; return len; + } - len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; } -- cgit From 5e8d4d36bf23bb7baf027c479d54395840219928 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: add support for primary_temp mappings Change apply_temps() to override primary in the same way pg_temp overrides the osd set. primary_temp overrides the pg_temp primary too. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index df9389ddd56c..20a38a37794c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1597,7 +1597,7 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, } /* - * Given up set, apply pg_temp mapping. + * Given up set, apply pg_temp and primary_temp mappings. * * Return acting set length. *primary is set to acting primary osd id, * or -1 if acting set is empty. @@ -1644,6 +1644,11 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* primary_temp?
*/ + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + if (pg) + temp_primary = pg->primary_temp.osd; + *primary = temp_primary; return temp_len; } -- cgit From 47ec1f3cc46dde00deb34922dbffdeda254ad76d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:49 +0200 Subject: libceph: add support for osd primary affinity Respond to non-default primary_affinity values accordingly. (Primary affinity allows the admin to shift 'primary responsibility' away from specific osds, effectively shifting around the read side of the workload and whatever overhead is incurred by peering and writes by virtue of being the primary). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 20a38a37794c..ae8f367c5291 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1596,6 +1596,72 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, return len; } +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int i; + int pos = -1; + + /* + * Do we have any non-default primary_affinity values for these + * osds? + */ + if (!osdmap->osd_primary_affinity) + return; + + for (i = 0; i < len; i++) { + if (osds[i] != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[osds[i]] != + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + break; + } + } + if (i == len) + return; + + /* + * Pick the primary. Feed both the seed (for the pg) and the + * osd into the hash/rng so that a proportional fraction of an + * osd's pgs get rejected as primary. + */ + for (i = 0; i < len; i++) { + int osd; + u32 aff; + + osd = osds[i]; + if (osd == CRUSH_ITEM_NONE) + continue; + + aff = osdmap->osd_primary_affinity[osd]; + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + pps, osd) >> 16) >= aff) { + /* + * We chose not to use this primary. Note it + * anyway as a fallback in case we don't pick + * anyone else, but keep looking. + */ + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = osds[pos]; + + if (ceph_can_shift_osds(pool) && pos > 0) { + /* move the new primary to the front */ + for (i = pos; i > 0; i--) + osds[i] = osds[i - 1]; + osds[0] = *primary; + } +} + /* * Given up set, apply pg_temp and primary_temp mappings. * @@ -1698,6 +1764,8 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, len = raw_to_up_osds(osdmap, pool, osds, len, primary); + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; -- cgit From c4c1228525a2cbf9e06657b01135391700c1ec14 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:49 +0200 Subject: libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() Reimplement ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() and get rid of the now unused calc_pg_raw().
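The acting set computation is now a pipeline: pg_to_raw_osds() -> raw_to_up_osds() -> apply_primary_affinity() -> apply_temps(). A worked example with made-up osd ids, for a replicated (shiftable) pool of size 3:

    raw (from crush):      [1, 4, 7]   say osd4 is down
    up:                    [1, 7]      up primary = 1
    primary affinity:      [7, 1]      say osd1 opted out, primary = 7
    pg_temp/primary_temp:  none here, so acting = [7, 1], primary = 7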
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 79 +++---------------------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ae8f367c5291..408d14826b5e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1455,71 +1455,6 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, return r; } -/* - * Calculate raw osd vector for the given pgid. Return pointer to osd - * array, or NULL on failure. - */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *num) -{ - struct ceph_pg_mapping *pg; - struct ceph_pg_pool_info *pool; - int ruleno; - int r; - u32 pps; - - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return NULL; - - /* pg_temp? */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); - if (pg) { - *num = pg->pg_temp.len; - return pg->pg_temp.osds; - } - - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); - if (ruleno < 0) { - pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return NULL; - } - - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed sothat pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy ehavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... - */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; - } - r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); - if (r < 0) { - pr_err("error %d from crush rule: pool %lld ruleset %d type %d" - " size %d\n", r, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return NULL; - } - *num = r; - return osds; -} - /* * Calculate raw (crush) set for given pgid. * @@ -1776,17 +1711,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, num = CEPH_PG_MAX_SIZE; + int osds[CEPH_PG_MAX_SIZE]; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); - /* primary is first up osd */ - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - return osds[i]; - return -1; + return primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); -- cgit From f31da0f3e12e57f21d73315e06c48fb9860fe07d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 2 Apr 2014 20:34:04 +0400 Subject: libceph: output primary affinity values on osdmap updates Similar to osd weights, output primary affinity values on incremental osdmap updates. 
Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 408d14826b5e..e632b5a52f5b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1005,6 +1005,8 @@ static int decode_new_primary_affinity(void **p, void *end, ret = set_primary_affinity(map, osd, aff); if (ret) return ret; + + pr_info("osd%d primary-affinity 0x%x\n", osd, aff); } return 0; -- cgit From 8a53f23fcda355958a79774c6333a3a31c380ecf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Apr 2014 18:21:32 +0400 Subject: libceph: dump pool {read,write}_tier to debugfs Dump pool {read,write}_tier to debugfs. While at it, fixup printk type specifiers and remove the unnecessary cast to unsigned long long. Signed-off-by: Ilya Dryomov --- net/ceph/debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 34453a2b4b4d..10421a4b76f8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -68,9 +68,9 @@ static int osdmap_show(struct seq_file *s, void *p) struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %llu pg_num %d / %d\n", - (unsigned long long)pool->id, pool->pg_num, - pool->pg_num_mask); + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", + pool->id, pool->pg_num, pool->pg_num_mask, + pool->read_tier, pool->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; -- cgit
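With this change, the pool lines in /sys/kernel/debug/ceph/<client>/osdmap read along the lines of (hypothetical values; pg_num_mask for pg_num 64 is 63, and -1 would indicate no tier configured):

    pool 2 pg_num 64 (63) read_tier -1 write_tier -1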