From 9d521470a40f16110bd31018034155c60c1a1275 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 31 Jan 2014 17:54:26 +0200 Subject: libceph: a per-osdc crush scratch buffer With the addition of erasure coding support in the future, scratch variable-length array in crush_do_rule_ary() is going to grow to at least 200 bytes on average, on top of another 128 bytes consumed by rawosd/osd arrays in the call chain. Replace it with a buffer inside struct osdmap and a mutex. This shouldn't result in any contention, because all osd requests were already serialized by request_mutex at that point; the only unlocked caller was ceph_ioctl_get_dataloc(). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index aade4a5c1c07..9d1aaa24def6 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -698,7 +698,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) map = kzalloc(sizeof(*map), GFP_NOFS); if (map == NULL) return ERR_PTR(-ENOMEM); + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -1142,14 +1144,20 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_oloc_oid_to_pg); -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, - int *result, int result_max, - const __u32 *weight, int weight_max) +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) { - int scratch[result_max * 3]; + int r; + + BUG_ON(result_max > CEPH_PG_MAX_SIZE); + + mutex_lock(&map->crush_scratch_mutex); + r = crush_do_rule(map->crush, ruleno, x, result, result_max, + weight, weight_max, map->crush_scratch_ary); + mutex_unlock(&map->crush_scratch_mutex); - return crush_do_rule(map, ruleno, x, result, result_max, - weight, weight_max, scratch); + return r; } /* @@ -1205,9 +1213,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule_ary(osdmap->crush, ruleno, pps, - osds, min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); + r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num), + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit From 7b25bf5f02c5c80adf96120e031dc3a1756ce54d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:26 +0200 Subject: libceph: encode CEPH_OSD_OP_FLAG_* op flags Encode ceph_osd_op::flags field so that it gets sent over the wire. 
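A minimal userspace sketch of the host-to-wire conversion this patch completes (an illustration under assumptions: op_host/op_wire and the flag value are made-up stand-ins, not the real ceph_osd_op layout):

#include <stdint.h>
#include <stdio.h>
#include <endian.h>     /* htole16()/htole32(), glibc */

/* hypothetical stand-ins for the host-order and wire-order op structs */
struct op_host { uint16_t op; uint32_t flags; uint32_t payload_len; };
struct op_wire { uint16_t op; uint32_t flags; uint32_t payload_len; };

static void encode_op(const struct op_host *src, struct op_wire *dst)
{
        dst->op          = htole16(src->op);
        dst->flags       = htole32(src->flags); /* the conversion the patch adds */
        dst->payload_len = htole32(src->payload_len);
}

int main(void)
{
        struct op_host src = { .op = 1, .flags = 0x1 /* some CEPH_OSD_OP_FLAG_* bit */,
                               .payload_len = 42 };
        struct op_wire dst;

        encode_op(&src, &dst);
        printf("op=%#x flags=%#x payload_len=%u\n", dst.op, dst.flags, dst.payload_len);
        return 0;
}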
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0676f2b199d6..5d7fd0b8c1c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -688,7 +688,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; -- cgit From c647b8a8c6366f849c2a237bfe525cb1d316d5f4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op This is primarily for rbd's benefit and is supposed to combat fragmentation: "... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit." SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set FAILOK flag for all SETALLOCHINT ops. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5d7fd0b8c1c8..71830d79b0f4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. 
+ */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); -- cgit From d90deda69cb82411ba7d990e97218e0f8b2d07bb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 23 Mar 2014 06:50:39 +0800 Subject: libceph: fix oops in ceph_msg_data_{pages,pagelist}_advance() When there is no more data, ceph_msg_data_{pages,pagelist}_advance() should not move on to the next page. Signed-off-by: Yan, Zheng --- net/ceph/messenger.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 30efc5c18622..4f55f9ce63fa 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); @@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); -- cgit From 48a163dbb517eba13643bf404a0d695c1ab0a60d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:36 +0200 Subject: crush: fix off-by-one errors in total_tries refactor Back in 27f4d1f6bc32c2ed7b2c5080cbd58b14df622607 we refactored the CRUSH code to allow adjustment of the retry counts on a per-pool basis. That commit had an off-by-one bug: the previous "tries" counter was a *retry* count, not a *try* count, but the new code was passing in 1 meaning there should be no retries. Fix the ftotal vs tries comparison to use < instead of <= to fix the problem. Note that the original code used <= here, which means the global "choose_total_tries" tunable is actually counting retries. Compensate for that by adding 1 in crush_do_rule when we pull the tunable into the local variable. This was noticed looking at output from a user provided osdmap. Unfortunately the map doesn't illustrate the change in mapping behavior and I haven't managed to construct one yet that does. Inspection of the crush debug output now aligns with prior versions, though. Reflects ceph.git commit 795704fd615f0b008dcc81aa088a859b2d075138. 
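A runnable userspace model of the tries-versus-retries distinction described above, before the diff below (illustrative only; attempts_made() is a hypothetical helper, not CRUSH code):

#include <stdio.h>

/* with a budget of "tries" attempts, the loop must test ftotal < tries;
 * testing <= grants one extra attempt, which is why crush_do_rule() now
 * adds 1 when converting the legacy choose_total_tries retry count */
static int attempts_made(unsigned int tries, int off_by_one)
{
        unsigned int ftotal = 0;
        int attempts = 0;

        for (;;) {
                attempts++;     /* one descent attempt ... */
                ftotal++;       /* ... which we pretend failed */
                if (off_by_one ? ftotal <= tries : ftotal < tries)
                        continue;       /* retry descent */
                break;                  /* out of tries */
        }
        return attempts;
}

int main(void)
{
        printf("correct:    %d attempts\n", attempts_made(5, 0)); /* 5 */
        printf("off-by-one: %d attempts\n", attempts_made(5, 1)); /* 6 */
        return 0;
}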
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b703790b4e44..074bb2a5e675 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -292,8 +292,8 @@ static int is_out(const struct crush_map *map, * @outpos: our position in that vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make - * @local_tries: localized retries - * @local_fallback_tries: localized fallback retries + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @out2: second output vector for leaf items (if @recurse_to_leaf) */ @@ -304,8 +304,8 @@ static int crush_choose_firstn(const struct crush_map *map, int *out, int outpos, unsigned int tries, unsigned int recurse_tries, - unsigned int local_tries, - unsigned int local_fallback_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, int recurse_to_leaf, int *out2) { @@ -344,9 +344,9 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 1; goto reject; } - if (local_fallback_tries > 0 && + if (local_fallback_retries > 0 && flocal >= (in->size>>1) && - flocal > local_fallback_tries) + flocal > local_fallback_retries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -393,8 +393,8 @@ static int crush_choose_firstn(const struct crush_map *map, x, outpos+1, 0, out2, outpos, recurse_tries, 0, - local_tries, - local_fallback_tries, + local_retries, + local_fallback_retries, 0, NULL) <= outpos) /* didn't get leaf */ @@ -420,14 +420,14 @@ reject: ftotal++; flocal++; - if (collide && flocal <= local_tries) + if (collide && flocal <= local_retries) /* retry locally a few times */ retry_bucket = 1; - else if (local_fallback_tries > 0 && - flocal <= in->size + local_fallback_tries) + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= tries) + else if (ftotal < tries) /* then retry descent */ retry_descent = 1; else @@ -640,10 +640,18 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int choose_tries = map->choose_total_tries; - int choose_local_tries = map->choose_local_tries; - int choose_local_fallback_tries = map->choose_local_fallback_tries; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. 
+ */ + int choose_tries = map->choose_total_tries + 1; int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -677,12 +685,12 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: if (curstep->arg1 > 0) - choose_local_tries = curstep->arg1; + choose_local_retries = curstep->arg1; break; case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: if (curstep->arg1 > 0) - choose_local_fallback_tries = curstep->arg1; + choose_local_fallback_retries = curstep->arg1; break; case CRUSH_RULE_CHOOSELEAF_FIRSTN: @@ -734,8 +742,8 @@ int crush_do_rule(const struct crush_map *map, o+osize, j, choose_tries, recurse_tries, - choose_local_tries, - choose_local_fallback_tries, + choose_local_retries, + choose_local_fallback_retries, recurse_to_leaf, c+osize); } else { -- cgit From 6ed1002f368c63ef79d7f659fcb4368a90098132 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: allow crush rules to set (re)tries counts to 0 These two fields are misnomers; they are *retry* counts. Reflects ceph.git commit f17caba8ae0cad7b6f8f35e53e5f73b444696835. Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 074bb2a5e675..b3fb84903b30 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -684,12 +684,12 @@ int crush_do_rule(const struct crush_map *map, break; case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: - if (curstep->arg1 > 0) + if (curstep->arg1 >= 0) choose_local_retries = curstep->arg1; break; case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: - if (curstep->arg1 > 0) + if (curstep->arg1 >= 0) choose_local_fallback_retries = curstep->arg1; break; -- cgit From e2b149cc4ba00766aceb87950c6de72ea7fc8b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add chooseleaf_vary_r tunable The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. 
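A tiny userspace sketch of the sub_r derivation, mirroring the computation in the hunk that follows (sub_r_for() is a hypothetical helper added for illustration):

#include <stdio.h>

/* vary_r == 0 keeps the old behaviour (recursion always starts at r = 0);
 * vary_r == 1 passes the parent's r through; higher values damp it */
static int sub_r_for(unsigned int vary_r, int r)
{
        return vary_r ? r >> (vary_r - 1) : 0;
}

int main(void)
{
        unsigned int vary_r;

        for (vary_r = 0; vary_r <= 3; vary_r++)
                printf("vary_r=%u: r=5 -> sub_r=%d, r=6 -> sub_r=%d\n",
                       vary_r, sub_r_for(vary_r, 5), sub_r_for(vary_r, 6));
        return 0;
}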
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b3fb84903b30..947150cde297 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, @@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_retries, unsigned int local_fallback_retries, int recurse_to_leaf, - int *out2) + unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; + r = rep + parent_r; /* r' = r + f_total */ r += ftotal; @@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, @@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map, local_retries, local_fallback_retries, 0, - NULL) <= outpos) + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map, int choose_local_retries = map->choose_local_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries; + int vary_r = map->chooseleaf_vary_r; + if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); return 0; @@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map, choose_local_retries, choose_local_fallback_retries, recurse_to_leaf, - c+osize); + vary_r, + c+osize, + 0); } else { crush_choose_indep( map, -- cgit From d83ed858f144e3cfbef883d4bc499113cdddabeb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 Mar 2014 16:58:37 +0200 Subject: crush: add SET_CHOOSELEAF_VARY_R step This lets you adjust the vary_r tunable on a per-rule basis. Reflects ceph.git commit f944ccc20aee60a7d8da7e405ec75ad1cd449fac. 
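A compact userspace model of how such SET_* steps override map-wide tunables per rule (illustrative; the enum and struct are stand-ins for the real CRUSH rule structures, and the >= 0 test is what lets a rule set a value of 0):

#include <stdio.h>

enum step_op { SET_CHOOSELEAF_VARY_R, CHOOSE }; /* hypothetical opcodes */

struct rule_step { enum step_op op; int arg1; };

int main(void)
{
        int vary_r = 0;         /* map-wide default */
        struct rule_step steps[] = { { SET_CHOOSELEAF_VARY_R, 1 }, { CHOOSE, 0 } };
        unsigned int i;

        for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
                switch (steps[i].op) {
                case SET_CHOOSELEAF_VARY_R:
                        if (steps[i].arg1 >= 0) /* >= 0, so 0 is settable */
                                vary_r = steps[i].arg1;
                        break;
                case CHOOSE:
                        printf("choose step runs with vary_r=%d\n", vary_r);
                        break;
                }
        }
        return 0;
}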
Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin --- net/ceph/crush/mapper.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 947150cde297..a1ef53c04415 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -709,6 +709,11 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; -- cgit From 35fea3a18a1df8f981f292b492c2de3a9e4e5fc2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:12 +0200 Subject: libceph: refer to osdmap directly in osdmap_show() To make it more readable and save screen space. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 258a382e75ed..d225842c7b41 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -53,34 +53,36 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct ceph_osdmap *map = client->osdc.osdmap; struct rb_node *n; - if (client->osdc.osdmap == NULL) + if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + + seq_printf(s, "epoch %d\n", map->epoch); seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", + (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %llu pg_num %d / %d\n", (unsigned long long)pool->id, pool->pg_num, pool->pg_num_mask); } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; + for (i = 0; i < map->max_osd; i++) { + struct ceph_entity_addr *addr = &map->osd_addr[i]; + int state = map->osd_state[i]; char sb[64]; seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); } + return 0; } -- cgit From 0a2800d7280ccdf3194bd8bd74a2eb8c315b54c6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:12 +0200 Subject: libceph: do not prefix osd lines with \t in debugfs output To save screen space in anticipation of more fields (e.g. primary affinity). 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d225842c7b41..112d98edb156 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p) int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); -- cgit From 1c00240e007d14d3242fa490b50166b4f1b2770a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: dump pg_temp mappings to debugfs Dump pg_temp mappings to /sys/kernel/debug/ceph/<client>/osdmap, one 'pg_temp <pgid> [<osd>, ..., <osd>]' per line, e.g: pg_temp 2.6 [2,3,4] Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 112d98edb156..c45d235e774e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -82,6 +82,17 @@ static int osdmap_show(struct seq_file *s, void *p) ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state)); } + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->osds[i]); + seq_printf(s, "]\n"); + } return 0; } -- cgit From 38a8d560231b45489d5b12f5c7d0edfba94e1f30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: dump osdmap and enhance output on decode errors Dump osdmap in hex on both full and incremental decode errors, to make it easier to match the contents with the error offset. dout() map epoch and max_osd value on success.
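A userspace stand-in for the hex dump this patch adds on decode errors (illustrative; the kernel side uses print_hex_dump(), not this helper):

#include <stdio.h>
#include <stddef.h>

/* dump the buffer that failed to decode, 16 bytes per row, so the
 * reported error offset can be matched against the contents by eye */
static void hex_dump(const unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                printf("%02x%c", buf[i], (i % 16 == 15) ? '\n' : ' ');
        if (len % 16)
                putchar('\n');
}

int main(void)
{
        unsigned char osdmap[] = { 0x07, 0x01, 0x2a, 0x00, 0xde, 0xad };

        hex_dump(osdmap, sizeof(osdmap));
        return 0;
}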
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9d1aaa24def6..4dd000d128fd 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -690,6 +690,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) u16 version; u32 len, max, i; int err = -EINVAL; + u32 epoch = 0; void *start = *p; struct ceph_pg_pool_info *pi; @@ -714,7 +715,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - map->epoch = ceph_decode_32(p); + epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); @@ -814,14 +815,18 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; } - /* ignore the rest of the map */ + /* ignore the rest */ *p = end; - dout("osdmap_decode done %p %p\n", *p, end); + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; bad: - dout("osdmap_decode fail err %d\n", err); + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); ceph_osdmap_destroy(map); return ERR_PTR(err); } @@ -845,6 +850,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, int err = -EINVAL; u16 version; + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + ceph_decode_16_safe(p, end, version, bad); if (version != 6) { pr_warning("got unknown v %d != 6 of inc osdmap\n", version); @@ -1032,11 +1039,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* ignore the rest */ *p = end; + + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; bad: - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", - epoch, (int)(*p - start), *p, start, end); + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); -- cgit From a2505d63ee0541d9b4685250b033192e68222e97 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:13 +0200 Subject: libceph: split osdmap allocation and decode steps Split osdmap allocation and initialization into a separate function, ceph_osdmap_decode(). 
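The split follows a common alloc/decode pattern; a minimal userspace sketch of it (the names here are illustrative, not the kernel functions):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct map { int epoch; };

/* bare decoder: fills a caller-provided map, returns 0 or -errno */
static int map_decode(const char **p, const char *end, struct map *m)
{
        if (end - *p < 1)
                return -EINVAL;
        m->epoch = *(*p)++;
        return 0;
}

/* wrapper owns allocation and cleanup, like ceph_osdmap_decode() */
static struct map *map_alloc_decode(const char **p, const char *end)
{
        struct map *m = calloc(1, sizeof(*m));
        int ret;

        if (!m)
                return NULL;            /* kernel: ERR_PTR(-ENOMEM) */

        ret = map_decode(p, end, m);
        if (ret) {
                free(m);                /* kernel: ceph_osdmap_destroy() */
                return NULL;            /* kernel: ERR_PTR(ret) */
        }
        return m;
}

int main(void)
{
        const char buf[] = { 42 };
        const char *p = buf;
        struct map *m = map_alloc_decode(&p, buf + sizeof(buf));

        if (m) {
                printf("epoch %d\n", m->epoch);
                free(m);
        }
        return 0;
}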
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 +- net/ceph/osdmap.c | 44 +++++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 71830d79b0f4..6f64eec18851 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2062,7 +2062,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4dd000d128fd..a82df6ea0749 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -684,9 +684,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * decode a full map. */ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - struct ceph_osdmap *map; u16 version; u32 len, max, i; int err = -EINVAL; @@ -694,14 +693,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) void *start = *p; struct ceph_pg_pool_info *pi; - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - - map->pg_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); ceph_decode_16_safe(p, end, version, bad); if (version > 6) { @@ -751,7 +743,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = osdmap_set_max_osd(map, max); if (err < 0) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); /* osds */ err = -EINVAL; @@ -819,7 +810,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); - return map; + return 0; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", @@ -827,8 +818,31 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -872,7 +886,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ -- cgit From 597b52f6ca247086371abd67e5083292a500e736 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: fixup error handling in osdmap_decode() The existing error handling scheme requires resetting err to -EINVAL prior to calling any ceph_decode_* macro. This is ugly and fragile, and there already are a few places where we would return 0 on error, due to a missing reset. Fix this by adding a special e_inval label to be used by all ceph_decode_* macros. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a82df6ea0749..298d076eee89 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -688,36 +688,37 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { u16 version; u32 len, max, i; - int err = -EINVAL; u32 epoch = 0; void *start = *p; + int err; struct ceph_pg_pool_info *pi; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); + ceph_decode_16_safe(p, end, version, e_inval); if (version > 6) { pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto bad; + goto e_inval; } if (version < 6) { pr_warning("got old v %d < 6 of osdmap\n", version); - goto bad; + goto e_inval; } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, bad); + ceph_decode_32_safe(p, end, max, e_inval); while (max--) { - ceph_decode_need(p, end, 8 + 2, bad); - err = -ENOMEM; + ceph_decode_need(p, end, 8 + 2, e_inval); pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) + if (!pi) { + err = -ENOMEM; goto bad; + } pi->id = ceph_decode_64(p); err = __decode_pool(p, end, pi); if (err < 0) { @@ -728,27 +729,25 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) } err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); + if (err) goto bad; - } - ceph_decode_32_safe(p, end, map->pool_max, bad); + ceph_decode_32_safe(p, end, map->pool_max, e_inval); - ceph_decode_32_safe(p, end, map->flags, bad); + ceph_decode_32_safe(p, end, map->flags, e_inval); max = ceph_decode_32(p); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; /* osds */ - err = -EINVAL; ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + - sizeof(*map->osd_addr)), bad); + sizeof(*map->osd_addr)), e_inval); + *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_state, map->max_osd); @@ -762,7 +761,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); for (i = 0; i < len; i++) { int n, j; struct ceph_pg pgid; @@ -771,16 +770,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u32), e_inval); n = ceph_decode_32(p); - err = -EINVAL; if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - ceph_decode_need(p, end, n * sizeof(u32), bad); - err = -ENOMEM; + goto e_inval; + ceph_decode_need(p, end, n * sizeof(u32), e_inval); pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) + if (!pg) { + err = -ENOMEM; goto bad; + } pg->pgid = pgid; pg->len = n; for (j = 0; j < n; j++) @@ -794,10 +793,10 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) } /* crush */ - ceph_decode_32_safe(p, end, len, 
bad); + ceph_decode_32_safe(p, end, len, e_inval); dout("osdmap_decode crush len %d from off 0x%x\n", len, (int)(*p - start)); - ceph_decode_need(p, end, len, bad); + ceph_decode_need(p, end, len, e_inval); map->crush = crush_decode(*p, end); *p += len; if (IS_ERR(map->crush)) { @@ -812,6 +811,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return 0; +e_inval: + err = -EINVAL; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); -- cgit From 3977058c468b872c6bc5e5273bf911d791848643 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: safely decode max_osd value in osdmap_decode() max_osd value is not covered by any ceph_decode_need(). Use a safe version of ceph_decode_* macro to decode it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 298d076eee89..ec06010657b3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -687,9 +687,10 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { u16 version; - u32 len, max, i; u32 epoch = 0; void *start = *p; + u32 max; + u32 len, i; int err; struct ceph_pg_pool_info *pi; @@ -736,7 +737,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, map->flags, e_inval); - max = ceph_decode_32(p); + /* max_osd */ + ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); -- cgit From 2d88b2e0819e0401ebb195e9fa20fab4be1965c8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:14 +0200 Subject: libceph: check length of osdmap osd arrays Check length of osd_state, osd_weight and osd_addr arrays. They should all have exactly max_osd elements after the call to osdmap_set_max_osd(). 
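A small userspace model of the added validation (check_len() is a hypothetical helper; the count is read host-endian for brevity):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* each osd array is preceded by a u32 count that must equal max_osd;
 * the old code skipped these 4 bytes, the patch validates the value */
static int check_len(const uint8_t **p, uint32_t max_osd)
{
        uint32_t len;

        memcpy(&len, *p, sizeof(len));
        *p += sizeof(len);
        return len == max_osd ? 0 : -1;
}

int main(void)
{
        const uint8_t buf[] = { 2, 0, 0, 0, 0x01, 0x05 }; /* count, 2 states */
        const uint8_t *p = buf;

        if (check_len(&p, 2))
                return 1;
        printf("osd_state: %#x %#x\n", p[0], p[1]);
        return 0;
}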
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ec06010657b3..c39ac624ccc3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -745,19 +745,25 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; - /* osds */ + /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + sizeof(*map->osd_addr)), e_inval); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_state, map->max_osd); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); for (i = 0; i < map->max_osd; i++) ceph_decode_addr(&map->osd_addr[i]); -- cgit From 9902e682c7f3df9ed5f60bc6f9c7efa6fd6b2d1d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:15 +0200 Subject: libceph: fix crush_decode() call site in osdmap_decode() The size of the memory area fed to crush_decode() should be limited not only by the osdmap end, but also by the crush map length. Also, drop the unnecessary dout() (the dout() in crush_decode() conveys the same info) and step past the crush map only if it is decoded successfully. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c39ac624ccc3..d4a6b0df3627 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -802,16 +802,13 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* crush */ ceph_decode_32_safe(p, end, len, e_inval); - dout("osdmap_decode crush len %d from off 0x%x\n", len, - (int)(*p - start)); - ceph_decode_need(p, end, len, e_inval); - map->crush = crush_decode(*p, end); - *p += len; + map->crush = crush_decode(*p, min(*p + len, end)); if (IS_ERR(map->crush)) { err = PTR_ERR(map->crush); map->crush = NULL; goto bad; } + *p += len; /* ignore the rest */ *p = end; -- cgit From 86f1742b94dd0b4a2eb9255205d1756ddea182f8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:15 +0200 Subject: libceph: fixup error handling in osdmap_apply_incremental() The existing error handling scheme requires resetting err to -EINVAL prior to calling any ceph_decode_* macro. This is ugly and fragile, and there already are a few places where we would return 0 on error, due to a missing reset. Follow osdmap_decode() and fix this by adding a special e_inval label to be used by all ceph_decode_* macros.
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 66 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d4a6b0df3627..b844a9273666 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -867,19 +867,19 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s64 new_pool_max; __s32 new_flags, max; void *start = *p; - int err = -EINVAL; + int err; u16 version; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); + ceph_decode_16_safe(p, end, version, e_inval); if (version != 6) { pr_warning("got unknown v %d != 6 of inc osdmap\n", version); - goto bad; + goto e_inval; } ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - bad); + e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -888,7 +888,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, new_flags = ceph_decode_32(p); /* full map? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); @@ -896,13 +896,14 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new crush? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - dout("apply_incremental new crush map len %d, %p to %p\n", - len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) - return ERR_CAST(newcrush); + if (IS_ERR(newcrush)) { + err = PTR_ERR(newcrush); + newcrush = NULL; + goto bad; + } *p += len; } @@ -912,13 +913,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), bad); + ceph_decode_need(p, end, 5*sizeof(u32), e_inval); /* new max? 
*/ max = ceph_decode_32(p); if (max >= 0) { err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; } @@ -932,11 +933,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); @@ -953,29 +954,28 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } if (version >= 5) { err = __decode_pool_names(p, end, map); - if (err < 0) + if (err) goto bad; } /* old_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ - err = -EINVAL; - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, bad); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); @@ -984,11 +984,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_state */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; u8 xorstate; - ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_32_safe(p, end, osd, e_inval); xorstate = **(u8 **)p; (*p)++; /* clean flag */ if (xorstate == 0) @@ -1000,10 +1000,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_weight */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); osd = ceph_decode_32(p); off = ceph_decode_32(p); pr_info("osd%d weight 0x%x %s\n", osd, off, @@ -1014,7 +1014,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_mapping *pg; int j; @@ -1024,22 +1024,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u32), e_inval); pglen = ceph_decode_32(p); if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), bad); + ceph_decode_need(p, end, pglen*sizeof(u32), e_inval); /* removing existing (if any) */ (void) __remove_pg_mapping(&map->pg_temp, pgid); /* insert */ - err = -EINVAL; if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - err = -ENOMEM; + goto e_inval; pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) + if (!pg) { + err = -ENOMEM; goto bad; + } pg->pgid = pgid; pg->len = pglen; for (j = 0; j < pglen; j++) @@ -1063,6 +1063,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; +e_inval: + err = -EINVAL; bad: pr_err("corrupt inc 
osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); -- cgit From 9464d00862ea6a5c0edc7118c86bdfa71a95190e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:16 +0200 Subject: libceph: nuke bogus encoding version check in osdmap_apply_incremental() Only version 6 of osdmap encoding is supported, anything other than version 6 results in an error and halts the decoding process. Checking if version is >= 5 is therefore bogus. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b844a9273666..07fa3697ea12 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -952,11 +952,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err < 0) goto bad; } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err) - goto bad; - } + + err = __decode_pool_names(p, end, map); + if (err) + goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, e_inval); -- cgit From 53bbaba9d811f75acc20038bfd707a4cfdc4571f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Mar 2014 16:36:16 +0200 Subject: libceph: fix and clarify ceph_decode_need() sizes Sum up sizeof(...) results instead of (incorrectly) hard-coding the number of bytes, expressed in ints and longs. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 07fa3697ea12..11649101a4a8 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -706,7 +706,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), e_inval); + /* fsid, epoch, created, modified */ + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); @@ -878,8 +880,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto e_inval; } - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - e_inval); + /* fsid, epoch, modified, new_pool_max, new_flags */ + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -913,10 +916,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), e_inval); - /* new max? */ - max = ceph_decode_32(p); + ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); if (err) -- cgit From 0f70c7eedbbbd245398fc74b4b020f3b800f071c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:27 +0200 Subject: libceph: rename __decode_pool{,_names}() to decode_pool{,_names}() To be in line with all the other osdmap decode helpers. 
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 11649101a4a8..39938d79756c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; @@ -587,7 +587,7 @@ bad: return -EINVAL; } -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; @@ -723,7 +723,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto bad; } pi->id = ceph_decode_64(p); - err = __decode_pool(p, end, pi); + err = decode_pool(p, end, pi); if (err < 0) { kfree(pi); goto bad; @@ -731,7 +731,8 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) __insert_pg_pool(&map->pg_pools, pi); } - err = __decode_pool_names(p, end, map); + /* pool_name */ + err = decode_pool_names(p, end, map); if (err) goto bad; @@ -949,12 +949,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pi->id = pool; __insert_pg_pool(&map->pg_pools, pi); } - err = __decode_pool(p, end, pi); + err = decode_pool(p, end, pi); if (err < 0) goto bad; } - err = __decode_pool_names(p, end, map); + /* new_pool_names */ + err = decode_pool_names(p, end, map); if (err) goto bad; -- cgit From 433fbdd31db267564bab20420bd8f161a7c69e4d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:27 +0200 Subject: libceph: introduce decode{,_new}_pools() and switch to them Consolidate pools (full map, map<u64,pg_pool_t>) and new_pools (inc map, same) decoding logic into a common helper and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 94 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 37 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 39938d79756c..0ba3062d3317 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -681,6 +681,55 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg_pool_info *pi; + u64 pool; + int ret; + + ceph_decode_64_safe(p, end, pool, e_inval); + + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!incremental || !pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + return -ENOMEM; + + pi->id = pool; + + ret = __insert_pg_pool(&map->pg_pools, pi); + if (ret) { + kfree(pi); + return ret; + } + } + + ret = decode_pool(p, end, pi); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, true); +} + /* * decode a full map. 
*/ @@ -692,7 +741,6 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) u32 max; u32 len, i; int err; - struct ceph_pg_pool_info *pi; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); @@ -714,22 +762,10 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, e_inval); - while (max--) { - ceph_decode_need(p, end, 8 + 2, e_inval); - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = ceph_decode_64(p); - err = decode_pool(p, end, pi); - if (err < 0) { - kfree(pi); - goto bad; - } - __insert_pg_pool(&map->pg_pools, pi); - } + /* pools */ + err = decode_pools(p, end, map); + if (err) + goto bad; /* pool_name */ err = decode_pool_names(p, end, map); @@ -934,26 +970,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = NULL; } - /* new_pool */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - struct ceph_pg_pool_info *pi; - - ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (!pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = pool; - __insert_pg_pool(&map->pg_pools, pi); - } - err = decode_pool(p, end, pi); - if (err < 0) - goto bad; - } + /* new_pools */ + err = decode_new_pools(p, end, map); + if (err) + goto bad; /* new_pool_names */ err = decode_pool_names(p, end, map); if (err) goto bad; -- cgit From 4d60351f9089ef0f39d73c0b6a103e61fc0ed187 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:28 +0200 Subject: libceph: switch osdmap_set_max_osd() to krealloc() Use krealloc() instead of rolling our own. (krealloc() with a NULL first argument acts as a kmalloc()). Properly initialize the new array elements. This is needed to make future additions to osdmap easier. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0ba3062d3317..a350286fd826 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -646,38 +646,40 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) } /* - * adjust max osd value. reallocate arrays. + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { u8 *state; - struct ceph_entity_addr *addr; u32 *weight; + struct ceph_entity_addr *addr; + int i; - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); + if (!state || !weight || !addr) { kfree(state); - kfree(addr); kfree(weight); + kfree(addr); + return -ENOMEM; } - /* copy old? 
*/ if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); + for (i = map->max_osd; i < max; i++) { + state[i] = 0; + weight[i] = CEPH_OSD_OUT; + memset(addr + i, 0, sizeof(*addr)); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; + map->max_osd = max; + return 0; } -- cgit From 10db634e2083a202a26123d6d4c9ede98d6a1199 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:28 +0200 Subject: libceph: introduce decode{,_new}_pg_temp() and switch to them Consolidate pg_temp (full map, map<pg_t,vector<u32>>) and new_pg_temp (inc map, same) decoding logic into a common helper and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 139 ++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 72 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a350286fd826..6497322d2e3c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -732,6 +732,67 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) return __decode_pools(p, end, map, true); } +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 len, i; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, len, e_inval); + + ret = __remove_pg_mapping(&map->pg_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || len > 0) { + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, len*sizeof(u32), e_inval); + + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + return -EINVAL; + + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->len = len; + for (i = 0; i < len; i++) + pg->osds[i] = ceph_decode_32(p); + + ret = __insert_pg_mapping(pg, &map->pg_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, true); +} + /* * decode a full map. 
*/ @@ -810,36 +871,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, e_inval); - for (i = 0; i < len; i++) { - int n, j; - struct ceph_pg pgid; - struct ceph_pg_mapping *pg; - - err = ceph_decode_pgid(p, end, &pgid); - if (err) - goto bad; - ceph_decode_need(p, end, sizeof(u32), e_inval); - n = ceph_decode_32(p); - if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto e_inval; - ceph_decode_need(p, end, n * sizeof(u32), e_inval); - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) { - err = -ENOMEM; - goto bad; - } - pg->pgid = pgid; - pg->len = n; - for (j = 0; j < n; j++) - pg->osds[j] = ceph_decode_32(p); - - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) - goto bad; - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, - len); - } + err = decode_pg_temp(p, end, map); + if (err) + goto bad; /* crush */ ceph_decode_32_safe(p, end, len, e_inval); @@ -1038,48 +1072,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - struct ceph_pg_mapping *pg; - int j; - struct ceph_pg pgid; - u32 pglen; - - err = ceph_decode_pgid(p, end, &pgid); - if (err) - goto bad; - ceph_decode_need(p, end, sizeof(u32), e_inval); - pglen = ceph_decode_32(p); - if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), e_inval); - - /* removing existing (if any) */ - (void) __remove_pg_mapping(&map->pg_temp, pgid); - - /* insert */ - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto e_inval; - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) { - err = -ENOMEM; - goto bad; - } - pg->pgid = pgid; - pg->len = pglen; - for (j = 0; j < pglen; j++) - pg->osds[j] = ceph_decode_32(p); - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) { - kfree(pg); - goto bad; - } - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, - pgid.seed, pglen); - } else { - /* remove */ - __remove_pg_mapping(&map->pg_temp, pgid); - } - } + err = decode_new_pg_temp(p, end, map); + if (err) + goto bad; /* ignore the rest */ *p = end; -- cgit From ec7af97258396161e6effba7e788c3fc3cb55263 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: introduce get_osdmap_client_data_v() Full and incremental osdmaps are structured identically and have identical headers. Add a helper to decode both "old" (16-bit version, v6) and "new" (8-bit struct_v+struct_compat+struct_len, v7) osdmap encoding headers and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 16 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6497322d2e3c..be2a65fbd902 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -683,6 +683,63 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +#define OSDMAP_WRAPPER_COMPAT_VER 7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 + +/* + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. 
+ */ +static int get_osdmap_client_data_v(void **p, void *end, + const char *prefix, u8 *v) +{ + u8 struct_v; + + ceph_decode_8_safe(p, end, struct_v, e_inval); + if (struct_v >= 7) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", + struct_v, struct_compat, + OSDMAP_WRAPPER_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore wrapper struct_len */ + + ceph_decode_8_safe(p, end, struct_v, e_inval); + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", + struct_v, struct_compat, + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore client data struct_len */ + } else { + u16 version; + + *p -= 1; + ceph_decode_16_safe(p, end, version, e_inval); + if (version < 6) { + pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, + prefix); + return -EINVAL; + } + + /* old osdmap encoding */ + struct_v = 0; + } + + *v = struct_v; + return 0; + +e_inval: + return -EINVAL; +} + static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, bool incremental) { @@ -798,7 +855,7 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) */ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) { - u16 version; + u8 struct_v; u32 epoch = 0; void *start = *p; u32 max; @@ -807,15 +864,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, e_inval); - if (version > 6) { - pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto e_inval; - } - if (version < 6) { - pr_warning("got old v %d < 6 of osdmap\n", version); - goto e_inval; - } + err = get_osdmap_client_data_v(p, end, "full", &struct_v); + if (err) + goto bad; /* fsid, epoch, created, modified */ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + @@ -943,15 +994,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s32 new_flags, max; void *start = *p; int err; - u16 version; + u8 struct_v; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, e_inval); - if (version != 6) { - pr_warning("got unknown v %d != 6 of inc osdmap\n", version); - goto e_inval; - } + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); + if (err) + goto bad; /* fsid, epoch, modified, new_pool_max, new_flags */ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + -- cgit From 35a935d75d51abe58d3427a8b4ae3745a5a14e1c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: generalize ceph_pg_mapping In preparation for adding support for primary_temp mappings, generalize struct ceph_pg_mapping so it can hold mappings other than pg_temp.
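The include/linux/ceph/osdmap.h half of this change is not part of this net/ceph-only excerpt. Roughly, the flat len/osds fields of struct ceph_pg_mapping move into a named pg_temp member inside an anonymous union, along these lines (a sketch of the header change, not the verbatim diff):

    struct ceph_pg_mapping {
            struct rb_node node;
            struct ceph_pg pgid;

            union {
                    struct {
                            int len;
                            int osds[];     /* flexible array, sized at kmalloc time */
                    } pg_temp;
                    /* a primary_temp member is added by a later patch */
            };
    };

This is why the accesses below change from pg->len and pg->osds to pg->pg_temp.len and pg->pg_temp.osds.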
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 4 ++-- net/ceph/osdmap.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c45d235e774e..5865f2c9580a 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -88,9 +88,9 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, pg->pgid.seed); - for (i = 0; i < pg->len; i++) + for (i = 0; i < pg->pg_temp.len; i++) seq_printf(s, "%s%d", (i == 0 ? "" : ","), - pg->osds[i]); + pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index be2a65fbd902..c67a309fdfc2 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -822,9 +822,9 @@ static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, return -ENOMEM; pg->pgid = pgid; - pg->len = len; + pg->pg_temp.len = len; for (i = 0; i < len; i++) - pg->osds[i] = ceph_decode_32(p); + pg->pg_temp.osds[i] = ceph_decode_32(p); ret = __insert_pg_mapping(pg, &map->pg_temp); if (ret) { @@ -1281,8 +1281,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pg_num_mask); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - *num = pg->len; - return pg->osds; + *num = pg->pg_temp.len; + return pg->pg_temp.osds; } /* crush */ -- cgit From 9686f94c8cfc06e8afb7b2233ab8f1f6ac01957f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:29 +0200 Subject: libceph: primary_temp infrastructure Add primary_temp mappings infrastructure. struct ceph_pg_mapping is overloaded; primary_temp mappings are stored in an rb-tree, rooted at ceph_osdmap, in a manner similar to pg_temp mappings. Dump primary_temp mappings to /sys/kernel/debug/ceph/<client>/osdmap, one 'primary_temp <pgid> <osd>' per line, e.g.: primary_temp 2.6 4 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 7 +++++++ net/ceph/osdmap.c | 10 +++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 5865f2c9580a..612bf55e6a8b 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -93,6 +93,13 @@ static int osdmap_show(struct seq_file *s, void *p) pg->pg_temp.osds[i]); seq_printf(s, "]\n"); } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } return 0; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c67a309fdfc2..c0fc517ab321 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -343,7 +343,7 @@ bad: /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -966,6 +973,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); map->pg_temp = RB_ROOT; +
map->primary_temp = RB_ROOT; mutex_init(&map->crush_scratch_mutex); ret = osdmap_decode(p, end, map); -- cgit From d286de796aab9e306e674c6d23c4f3c1f55e394c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_temp decode bits Add a common helper to decode both primary_temp (full map, map<pg_t, u32>) and new_primary_temp (inc map, same) and switch to it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c0fc517ab321..c2d793d3d98d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -857,6 +857,61 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) return __decode_pg_temp(p, end, map, true); } +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 osd; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + + ret = __remove_pg_mapping(&map->primary_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || osd != (u32)-1) { + struct ceph_pg_mapping *pg; + + pg = kzalloc(sizeof(*pg), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->primary_temp.osd = osd; + + ret = __insert_pg_mapping(pg, &map->primary_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, + struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, true); +} + /* * decode a full map. */ @@ -933,6 +988,13 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; + /* primary_temp */ + if (struct_v >= 1) { + err = decode_primary_temp(p, end, map); + if (err) + goto bad; + } + /* crush */ ceph_decode_32_safe(p, end, len, e_inval); map->crush = crush_decode(*p, min(*p + len, end)); @@ -1133,6 +1195,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err) goto bad; + /* new_primary_temp */ + if (struct_v >= 1) { + err = decode_new_primary_temp(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; -- cgit From 2cfa34f2d67a36e292cbe6e4c1e60d212b7ba4d1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_affinity infrastructure Add primary_affinity infrastructure. primary_affinity values are stored in a max_osd-sized array, hanging off ceph_osdmap, similar to the osd_weight array. Introduce {get,set}_primary_affinity() helpers, primarily to return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY when no affinity has been set and to abstract out osd_primary_affinity array allocation and initialization.
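For context: affinity values use the same 16.16 fixed-point convention as osd weights, assuming the rados.h definition CEPH_OSD_DEFAULT_PRIMARY_AFFINITY == 0x10000 (i.e. 1.0). A minimal sketch of the conversion the debugfs hunk below performs:

    /* 0x10000 -> 100%, 0x8000 -> 50%, 0 -> 0% (assumes 16.16 fixed point) */
    static inline int primary_affinity_to_percent(u32 aff)
    {
            return (aff * 100) >> 16;
    }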
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/debugfs.c | 5 +++-- net/ceph/osdmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 612bf55e6a8b..34453a2b4b4d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,10 +77,11 @@ static int osdmap_show(struct seq_file *s, void *p) int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", i, ceph_pr_addr(&addr->in_addr), ((map->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); } for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c2d793d3d98d..0ac129388e07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -649,6 +649,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_state); kfree(map->osd_weight); kfree(map->osd_addr); + kfree(map->osd_primary_affinity); kfree(map); } @@ -685,6 +686,20 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) map->osd_weight = weight; map->osd_addr = addr; + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + + for (i = map->max_osd; i < max; i++) + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + map->osd_primary_affinity = affinity; + } + map->max_osd = max; return 0; @@ -912,6 +927,38 @@ static int decode_new_primary_temp(void **p, void *end, return __decode_primary_temp(p, end, map, true); } +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + /* * decode a full map. */ -- cgit From 63a6993f521b2629872e89c02a336fb3f18b092b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 Mar 2014 19:05:30 +0200 Subject: libceph: primary_affinity decode bits Add two helpers to decode primary_affinity (full map, vector<u32>) and new_primary_affinity (inc map, map<u32, u32>) and switch to them.
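Schematically, the two encodings look like this on the wire (layout inferred from the decode helpers below, not quoted from the ceph encoding spec; all fields little-endian):

    /* primary_affinity, full map: vector<u32>, len == max_osd or 0 */
    u32 len;
    u32 aff[len];

    /* new_primary_affinity, inc map: map<u32, u32>, changed osds only */
    u32 n;
    struct { u32 osd; u32 aff; } entries[n];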
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0ac129388e07..9568e6221852 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -959,6 +959,60 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) return 0; } +static int decode_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0) { + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + return 0; + } + if (len != map->max_osd) + goto e_inval; + + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + + for (i = 0; i < map->max_osd; i++) { + int ret; + + ret = set_primary_affinity(map, i, ceph_decode_32(p)); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + u32 osd, aff; + int ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_32_safe(p, end, aff, e_inval); + + ret = set_primary_affinity(map, osd, aff); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + /* * decode a full map. */ @@ -1042,6 +1096,17 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto bad; } + /* primary_affinity */ + if (struct_v >= 2) { + err = decode_primary_affinity(p, end, map); + if (err) + goto bad; + } else { + /* XXX can this happen? */ + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + } + /* crush */ ceph_decode_32_safe(p, end, len, e_inval); map->crush = crush_decode(*p, min(*p + len, end)); @@ -1249,6 +1314,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto bad; } + /* new_primary_affinity */ + if (struct_v >= 2) { + err = decode_new_primary_affinity(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; -- cgit From 2bd93d4d7ec2dd461cfb87c6d8a9b1ef9b30de08 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:47 +0200 Subject: libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers pg_to_raw_osds() helper for computing a raw (crush) set, which can contain non-existent and down osds. raw_to_up_osds() helper for pruning non-existent and down osds from the raw set, thereby transforming it into an up set, and determining up primary. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9568e6221852..b8bbef058019 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1520,6 +1520,82 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, return osds; } +/* + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error.
+ */ +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + struct ceph_pg pgid, u32 pps, int *osds) +{ + int ruleno; + int len; + + /* crush */ + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); + if (ruleno < 0) { + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); + return -ENOENT; + } + + len = do_crush(osdmap, ruleno, pps, osds, + min_t(int, pool->size, CEPH_PG_MAX_SIZE), + osdmap->osd_weight, osdmap->max_osd); + if (len < 0) { + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", + len, ruleno, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); + return len; + } + + return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length. *primary is set to up primary osd id, or -1 + * if up set is empty. + */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int up_primary = -1; + int i; + + if (ceph_can_shift_osds(pool)) { + int removed = 0; + + for (i = 0; i < len; i++) { + if (ceph_osd_is_down(osdmap, osds[i])) { + removed++; + continue; + } + if (removed) + osds[i - removed] = osds[i]; + } + + len -= removed; + if (len > 0) + up_primary = osds[0]; + } else { + for (i = len - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, osds[i])) + osds[i] = CRUSH_ITEM_NONE; + else + up_primary = osds[i]; + } + } + + *primary = up_primary; + return len; +} + /* * Return acting set for given pgid. */ -- cgit From 45966c3467e8291382a8970adbd78403a7818d45 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:47 +0200 Subject: libceph: introduce apply_temps() helper apply_temps() helper for applying various temporary mappings (at this point only pg_temp mappings) to the up set, thereby transforming it into an acting set. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8bbef058019..bd40f56b53ed 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1596,6 +1596,58 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, return len; } +/* + * Given up set, apply pg_temp mapping. + * + * Return acting set length. *primary is set to acting primary osd id, + * or -1 if acting set is empty. + */ +static int apply_temps(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, + int *osds, int len, int *primary) +{ + struct ceph_pg_mapping *pg; + int temp_len; + int temp_primary; + int i; + + /* raw_pg -> pg */ + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, + pool->pg_num_mask); + + /* pg_temp? */ + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + temp_len = 0; + temp_primary = -1; + + for (i = 0; i < pg->pg_temp.len; i++) { + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { + if (ceph_can_shift_osds(pool)) + continue; + else + osds[temp_len++] = CRUSH_ITEM_NONE; + } else { + osds[temp_len++] = pg->pg_temp.osds[i]; + } + } + + /* apply pg_temp's primary */ + for (i = 0; i < temp_len; i++) { + if (osds[i] != CRUSH_ITEM_NONE) { + temp_primary = osds[i]; + break; + } + } + } else { + temp_len = len; + temp_primary = *primary; + } + + *primary = temp_primary; + return temp_len; +} + /* * Return acting set for given pgid.
*/ -- cgit From ac972230e20581b044f5ce66dcaf3c5af8d57444 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: switch ceph_calc_pg_acting() to new helpers Switch ceph_calc_pg_acting() to new helpers: pg_to_raw_osds(), raw_to_up_osds() and apply_temps(). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index bd40f56b53ed..f1cad21d1533 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1649,24 +1649,49 @@ static int apply_temps(struct ceph_osdmap *osdmap, } /* - * Return acting set for given pgid. + * Calculate acting set for given pgid. + * + * Return acting set length, or error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) + int *osds) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) + return 0; - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) + return len; + + len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + + len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + + return len; } /* -- cgit From 8008ab1080c1768b02d232dcfd9e161cd47cc9f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: return primary from ceph_calc_pg_acting() In preparation for adding support for primary_temp, stop assuming primaryness: add a primary out parameter to ceph_calc_pg_acting() and change call sites accordingly. Primary is now specified separately from the order of osds in the set. 
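A minimal sketch of the new calling convention (hypothetical caller, mirroring the __map_request() change below):

    int osds[CEPH_PG_MAX_SIZE];
    int primary;
    int len;

    len = ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
    if (len < 0)
            len = 0;        /* treat errors as "no acting set" */
    /* primary is -1 if the acting set is empty or an error occurred */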
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 10 ++++------ net/ceph/osdmap.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f64eec18851..b4157dc22199 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1333,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; @@ -1346,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f1cad21d1533..df9389ddd56c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1651,19 +1651,21 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* * Calculate acting set for given pgid. * - * Return acting set length, or error. + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. */ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds) + int *osds, int *primary) { struct ceph_pg_pool_info *pool; u32 pps; int len; - int primary; pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return 0; + if (!pool) { + *primary = -1; + return -ENOENT; + } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ @@ -1684,12 +1686,14 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) + if (len < 0) { + *primary = -1; return len; + } - len = raw_to_up_osds(osdmap, pool, osds, len, &primary); + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - len = apply_temps(osdmap, pool, pgid, osds, len, &primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; } -- cgit From 5e8d4d36bf23bb7baf027c479d54395840219928 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:48 +0200 Subject: libceph: add support for primary_temp mappings Change apply_temps() to override primary in the same way pg_temp overrides the osd set. primary_temp overrides the pg_temp primary too. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index df9389ddd56c..20a38a37794c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1597,7 +1597,7 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, } /* - * Given up set, apply pg_temp mapping. + * Given up set, apply pg_temp and primary_temp mappings. * * Return acting set length. *primary is set to acting primary osd id, * or -1 if acting set is empty. @@ -1644,6 +1644,11 @@ static int apply_temps(struct ceph_osdmap *osdmap, /* primary_temp?
*/ + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + if (pg) + temp_primary = pg->primary_temp.osd; + *primary = temp_primary; return temp_len; } -- cgit From 47ec1f3cc46dde00deb34922dbffdeda254ad76d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:49 +0200 Subject: libceph: add support for osd primary affinity Respond to non-default primary_affinity values accordingly. (Primary affinity allows the admin to shift 'primary responsibility' away from specific osds, effectively shifting around the read side of the workload and whatever overhead is incurred by peering and writes by virtue of being the primary). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 20a38a37794c..ae8f367c5291 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1596,6 +1596,72 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap, return len; } +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int i; + int pos = -1; + + /* + * Do we have any non-default primary_affinity values for these + * osds? + */ + if (!osdmap->osd_primary_affinity) + return; + + for (i = 0; i < len; i++) { + if (osds[i] != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[osds[i]] != + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + break; + } + } + if (i == len) + return; + + /* + * Pick the primary. Feed both the seed (for the pg) and the + * osd into the hash/rng so that a proportional fraction of an + * osd's pgs get rejected as primary. + */ + for (i = 0; i < len; i++) { + int osd; + u32 aff; + + osd = osds[i]; + if (osd == CRUSH_ITEM_NONE) + continue; + + aff = osdmap->osd_primary_affinity[osd]; + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + pps, osd) >> 16) >= aff) { + /* + * We chose not to use this primary. Note it + * anyway as a fallback in case we don't pick + * anyone else, but keep looking. + */ + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = osds[pos]; + + if (ceph_can_shift_osds(pool) && pos > 0) { + /* move the new primary to the front */ + for (i = pos; i > 0; i--) + osds[i] = osds[i - 1]; + osds[0] = *primary; + } +} + /* * Given up set, apply pg_temp and primary_temp mappings. * @@ -1698,6 +1764,8 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, len = raw_to_up_osds(osdmap, pool, osds, len, primary); + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); + len = apply_temps(osdmap, pool, pgid, osds, len, primary); return len; -- cgit From c4c1228525a2cbf9e06657b01135391700c1ec14 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Mar 2014 17:12:49 +0200 Subject: libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() Reimplement ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() and get rid of the now unused calc_pg_raw().
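The acting set computation is now a pipeline: pg_to_raw_osds() -> raw_to_up_osds() -> apply_primary_affinity() -> apply_temps(). A worked example with made-up osd ids, for a replicated (shiftable) pool of size 3:

    raw (from crush):      [1, 4, 7]   say osd4 is down
    up:                    [1, 7]      up primary = 1
    primary affinity:      [7, 1]      say osd1 opted out, primary = 7
    pg_temp/primary_temp:  none here, so acting = [7, 1], primary = 7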
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 79 +++---------------------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ae8f367c5291..408d14826b5e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1455,71 +1455,6 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, return r; } -/* - * Calculate raw osd vector for the given pgid. Return pointer to osd - * array, or NULL on failure. - */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *num) -{ - struct ceph_pg_mapping *pg; - struct ceph_pg_pool_info *pool; - int ruleno; - int r; - u32 pps; - - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return NULL; - - /* pg_temp? */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); - if (pg) { - *num = pg->pg_temp.len; - return pg->pg_temp.osds; - } - - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); - if (ruleno < 0) { - pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return NULL; - } - - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed sothat pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy ehavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... - */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; - } - r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); - if (r < 0) { - pr_err("error %d from crush rule: pool %lld ruleset %d type %d" - " size %d\n", r, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return NULL; - } - *num = r; - return osds; -} - /* * Calculate raw (crush) set for given pgid. * @@ -1776,17 +1711,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, num = CEPH_PG_MAX_SIZE; + int osds[CEPH_PG_MAX_SIZE]; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); - /* primary is first up osd */ - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - return osds[i]; - return -1; + return primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); -- cgit From f31da0f3e12e57f21d73315e06c48fb9860fe07d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 2 Apr 2014 20:34:04 +0400 Subject: libceph: output primary affinity values on osdmap updates Similar to osd weights, output primary affinity values on incremental osdmap updates. 
Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 408d14826b5e..e632b5a52f5b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1005,6 +1005,8 @@ static int decode_new_primary_affinity(void **p, void *end, ret = set_primary_affinity(map, osd, aff); if (ret) return ret; + + pr_info("osd%d primary-affinity 0x%x\n", osd, aff); } return 0; -- cgit From 8a53f23fcda355958a79774c6333a3a31c380ecf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Apr 2014 18:21:32 +0400 Subject: libceph: dump pool {read,write}_tier to debugfs Dump pool {read,write}_tier to debugfs. While at it, fixup printk type specifiers and remove the unnecessary cast to unsigned long long. Signed-off-by: Ilya Dryomov --- net/ceph/debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 34453a2b4b4d..10421a4b76f8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -68,9 +68,9 @@ static int osdmap_show(struct seq_file *s, void *p) struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %llu pg_num %d / %d\n", - (unsigned long long)pool->id, pool->pg_num, - pool->pg_num_mask); + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", + pool->id, pool->pg_num, pool->pg_num_mask, + pool->read_tier, pool->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; -- cgit
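With this change, the pool lines in /sys/kernel/debug/ceph/<client>/osdmap read along the lines of (hypothetical values; pg_num_mask for pg_num 64 is 63, and -1 would indicate no tier configured):

    pool 2 pg_num 64 (63) read_tier -1 write_tier -1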