summaryrefslogtreecommitdiff
path: root/net/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'net/ceph')
-rw-r--r--net/ceph/buffer.c22
-rw-r--r--net/ceph/ceph_common.c24
-rw-r--r--net/ceph/crush/crush.c7
-rw-r--r--net/ceph/crush/mapper.c336
-rw-r--r--net/ceph/debugfs.c3
-rw-r--r--net/ceph/messenger.c81
-rw-r--r--net/ceph/mon_client.c8
-rw-r--r--net/ceph/osd_client.c361
-rw-r--r--net/ceph/osdmap.c78
9 files changed, 709 insertions, 211 deletions
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
index bf3e6a13c215..621b5f65407f 100644
--- a/net/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -6,6 +6,7 @@
#include <linux/ceph/buffer.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */
struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
{
@@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
if (!b)
return NULL;
- b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
- if (b->vec.iov_base) {
- b->is_vmalloc = false;
- } else {
- b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL);
- if (!b->vec.iov_base) {
- kfree(b);
- return NULL;
- }
- b->is_vmalloc = true;
+ b->vec.iov_base = ceph_kvmalloc(len, gfp);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
}
kref_init(&b->kref);
@@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref)
struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
dout("buffer_release %p\n", b);
- if (b->vec.iov_base) {
- if (b->is_vmalloc)
- vfree(b->vec.iov_base);
- else
- kfree(b->vec.iov_base);
- }
+ ceph_kvfree(b->vec.iov_base);
kfree(b);
}
EXPORT_SYMBOL(ceph_buffer_release);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 34b11ee8124e..67d7721d237e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
@@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
EXPORT_SYMBOL(ceph_compare_options);
+void *ceph_kvmalloc(size_t size, gfp_t flags)
+{
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ void *ptr = kmalloc(size, flags | __GFP_NOWARN);
+ if (ptr)
+ return ptr;
+ }
+
+ return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+void ceph_kvfree(const void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
{
@@ -461,8 +481,8 @@ EXPORT_SYMBOL(ceph_client_id);
* create a fresh client instance
*/
struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
- unsigned int supported_features,
- unsigned int required_features)
+ u64 supported_features,
+ u64 required_features)
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 089613234f03..16bc199d9a62 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map)
if (map->rules) {
__u32 b;
for (b = 0; b < map->max_rules; b++)
- kfree(map->rules[b]);
+ crush_destroy_rule(map->rules[b]);
kfree(map->rules);
}
kfree(map);
}
-
+void crush_destroy_rule(struct crush_rule *rule)
+{
+ kfree(rule);
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index cbd06a91941c..b703790b4e44 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -189,7 +189,7 @@ static int terminal(int x)
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
int x, int r)
{
- int n, l;
+ int n;
__u32 w;
__u64 t;
@@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
n = bucket->num_nodes >> 1;
while (!terminal(n)) {
+ int l;
/* pick point in [0, w) */
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
*/
-static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
+static int is_out(const struct crush_map *map,
+ const __u32 *weight, int weight_max,
+ int item, int x)
{
+ if (item >= weight_max)
+ return 1;
if (weight[item] >= 0x10000)
return 0;
if (weight[item] == 0)
@@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
}
/**
- * crush_choose - choose numrep distinct items of given type
+ * crush_choose_firstn - choose numrep distinct items of given type
* @map: the crush_map
* @bucket: the bucket we are choose an item from
* @x: crush input value
@@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
* @type: the type of item to choose
* @out: pointer to output vector
* @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @descend_once: true if we should only try one descent before giving up
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_tries: localized retries
+ * @local_fallback_tries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
* @out2: second output vector for leaf items (if @recurse_to_leaf)
*/
-static int crush_choose(const struct crush_map *map,
- struct crush_bucket *bucket,
- const __u32 *weight,
- int x, int numrep, int type,
- int *out, int outpos,
- int firstn, int recurse_to_leaf,
- int descend_once, int *out2)
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_tries,
+ unsigned int local_fallback_tries,
+ int recurse_to_leaf,
+ int *out2)
{
int rep;
unsigned int ftotal, flocal;
@@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map,
collide = 0;
retry_bucket = 0;
r = rep;
- if (in->alg == CRUSH_BUCKET_UNIFORM) {
- /* be careful */
- if (firstn || (__u32)numrep >= in->size)
- /* r' = r + f_total */
- r += ftotal;
- else if (in->size % numrep == 0)
- /* r'=r+(n+1)*f_local */
- r += (numrep+1) *
- (flocal+ftotal);
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- } else {
- if (firstn)
- /* r' = r + f_total */
- r += ftotal;
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- }
+ /* r' = r + f_total */
+ r += ftotal;
/* bucket choose */
if (in->size == 0) {
reject = 1;
goto reject;
}
- if (map->choose_local_fallback_tries > 0 &&
+ if (local_fallback_tries > 0 &&
flocal >= (in->size>>1) &&
- flocal > map->choose_local_fallback_tries)
+ flocal > local_fallback_tries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
@@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map,
reject = 0;
if (!collide && recurse_to_leaf) {
if (item < 0) {
- if (crush_choose(map,
+ if (crush_choose_firstn(map,
map->buckets[-1-item],
- weight,
+ weight, weight_max,
x, outpos+1, 0,
out2, outpos,
- firstn, 0,
- map->chooseleaf_descend_once,
+ recurse_tries, 0,
+ local_tries,
+ local_fallback_tries,
+ 0,
NULL) <= outpos)
/* didn't get leaf */
reject = 1;
@@ -414,6 +409,7 @@ static int crush_choose(const struct crush_map *map,
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
+ weight_max,
item, x);
else
reject = 0;
@@ -424,17 +420,14 @@ reject:
ftotal++;
flocal++;
- if (reject && descend_once)
- /* let outer call try again */
- skip_rep = 1;
- else if (collide && flocal <= map->choose_local_tries)
+ if (collide && flocal <= local_tries)
/* retry locally a few times */
retry_bucket = 1;
- else if (map->choose_local_fallback_tries > 0 &&
- flocal <= in->size + map->choose_local_fallback_tries)
+ else if (local_fallback_tries > 0 &&
+ flocal <= in->size + local_fallback_tries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal <= map->choose_total_tries)
+ else if (ftotal <= tries)
/* then retry descent */
retry_descent = 1;
else
@@ -464,21 +457,179 @@ reject:
/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ *
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r)
+{
+ struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(in, x, r);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(map,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out? */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
* @ruleno: the rule id
* @x: hash input
* @result: pointer to result vector
* @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @scratch: scratch vector for private use; must be >= 3 * result_max
*/
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
- const __u32 *weight)
+ const __u32 *weight, int weight_max,
+ int *scratch)
{
int result_len;
- int a[CRUSH_MAX_SET];
- int b[CRUSH_MAX_SET];
- int c[CRUSH_MAX_SET];
+ int *a = scratch;
+ int *b = scratch + result_max;
+ int *c = scratch + result_max*2;
int recurse_to_leaf;
int *w;
int wsize = 0;
@@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map,
__u32 step;
int i, j;
int numrep;
- int firstn;
- const int descend_once = 0;
+ int choose_tries = map->choose_total_tries;
+ int choose_local_tries = map->choose_local_tries;
+ int choose_local_fallback_tries = map->choose_local_fallback_tries;
+ int choose_leaf_tries = 0;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map,
o = b;
for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
struct crush_rule_step *curstep = &rule->steps[step];
- firstn = 0;
switch (curstep->op) {
case CRUSH_RULE_TAKE:
w[0] = curstep->arg1;
wsize = 1;
break;
- case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 > 0)
+ choose_local_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 > 0)
+ choose_local_fallback_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
/* fall through */
- case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
case CRUSH_RULE_CHOOSE_INDEP:
if (wsize == 0)
break;
recurse_to_leaf =
curstep->op ==
- CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
curstep->op ==
- CRUSH_RULE_CHOOSE_LEAF_INDEP;
+ CRUSH_RULE_CHOOSELEAF_INDEP;
/* reset output */
osize = 0;
@@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map,
continue;
}
j = 0;
- osize += crush_choose(map,
- map->buckets[-1-w[i]],
- weight,
- x, numrep,
- curstep->arg2,
- o+osize, j,
- firstn,
- recurse_to_leaf,
- descend_once, c+osize);
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ recurse_tries,
+ choose_local_tries,
+ choose_local_fallback_tries,
+ recurse_to_leaf,
+ c+osize);
+ } else {
+ crush_choose_indep(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0);
+ osize += numrep;
+ }
}
if (recurse_to_leaf)
/* copy final _leaf_ values to output set */
memcpy(o, c, osize*sizeof(*o));
- /* swap t and w arrays */
+ /* swap o and w arrays */
tmp = o;
o = w;
w = tmp;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 83661cdc0766..258a382e75ed 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp)
req->r_osd ? req->r_osd->o_osd : -1,
req->r_pgid.pool, req->r_pgid.seed);
- seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
+ seq_printf(s, "%.*s", req->r_base_oid.name_len,
+ req->r_base_oid.name);
if (req->r_reassert_version.epoch)
seq_printf(s, "\t%u'%llu",
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4a5df7b1cc9f..30efc5c18622 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -15,6 +15,7 @@
#include <linux/dns_resolver.h>
#include <net/tcp.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
@@ -777,13 +778,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
bio = data->bio;
BUG_ON(!bio);
- BUG_ON(!bio->bi_vcnt);
cursor->resid = min(length, data->bio_length);
cursor->bio = bio;
- cursor->vector_index = 0;
- cursor->vector_offset = 0;
- cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+ cursor->bvec_iter = bio->bi_iter;
+ cursor->last_piece =
+ cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
}
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -792,71 +792,67 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
{
struct ceph_msg_data *data = cursor->data;
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
- bio_vec = &bio->bi_io_vec[index];
- BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
- *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+ *page_offset = (size_t) bio_vec.bv_offset;
BUG_ON(*page_offset >= PAGE_SIZE);
if (cursor->last_piece) /* pagelist offset is always 0 */
*length = cursor->resid;
else
- *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+ *length = (size_t) bio_vec.bv_len;
BUG_ON(*length > cursor->resid);
BUG_ON(*page_offset + *length > PAGE_SIZE);
- return bio_vec->bv_page;
+ return bio_vec.bv_page;
}
static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
- bio_vec = &bio->bi_io_vec[index];
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
/* Advance the cursor offset */
BUG_ON(cursor->resid < bytes);
cursor->resid -= bytes;
- cursor->vector_offset += bytes;
- if (cursor->vector_offset < bio_vec->bv_len)
+
+ bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+
+ if (bytes < bio_vec.bv_len)
return false; /* more bytes to process in this segment */
- BUG_ON(cursor->vector_offset != bio_vec->bv_len);
/* Move on to the next segment, and possibly the next bio */
- if (++index == (unsigned int) bio->bi_vcnt) {
+ if (!cursor->bvec_iter.bi_size) {
bio = bio->bi_next;
- index = 0;
+ cursor->bio = bio;
+ if (bio)
+ cursor->bvec_iter = bio->bi_iter;
+ else
+ memset(&cursor->bvec_iter, 0,
+ sizeof(cursor->bvec_iter));
}
- cursor->bio = bio;
- cursor->vector_index = index;
- cursor->vector_offset = 0;
if (!cursor->last_piece) {
BUG_ON(!cursor->resid);
BUG_ON(!bio);
/* A short read is OK, so use <= rather than == */
- if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+ if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
cursor->last_piece = true;
}
@@ -1865,7 +1861,9 @@ int ceph_parse_ips(const char *c, const char *end,
port = (port * 10) + (*p - '0');
p++;
}
- if (port > 65535 || port == 0)
+ if (port == 0)
+ port = CEPH_MON_PORT;
+ else if (port > 65535)
goto bad;
} else {
port = CEPH_MON_PORT;
@@ -1945,7 +1943,8 @@ static int process_connect(struct ceph_connection *con)
{
u64 sup_feat = con->msgr->supported_features;
u64 req_feat = con->msgr->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
+ u64 server_feat = ceph_sanitize_features(
+ le64_to_cpu(con->in_reply.features));
int ret;
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -2853,8 +2852,8 @@ static void con_fault(struct ceph_connection *con)
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features,
+ u64 supported_features,
+ u64 required_features,
bool nocrc)
{
msgr->supported_features = supported_features;
@@ -3126,15 +3125,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
INIT_LIST_HEAD(&m->data);
/* front */
- m->front_max = front_len;
if (front_len) {
- if (front_len > PAGE_CACHE_SIZE) {
- m->front.iov_base = __vmalloc(front_len, flags,
- PAGE_KERNEL);
- m->front_is_vmalloc = true;
- } else {
- m->front.iov_base = kmalloc(front_len, flags);
- }
+ m->front.iov_base = ceph_kvmalloc(front_len, flags);
if (m->front.iov_base == NULL) {
dout("ceph_msg_new can't allocate %d bytes\n",
front_len);
@@ -3143,7 +3135,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
} else {
m->front.iov_base = NULL;
}
- m->front.iov_len = front_len;
+ m->front_alloc_len = m->front.iov_len = front_len;
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
@@ -3256,10 +3248,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
void ceph_msg_kfree(struct ceph_msg *m)
{
dout("msg_kfree %p\n", m);
- if (m->front_is_vmalloc)
- vfree(m->front.iov_base);
- else
- kfree(m->front.iov_base);
+ ceph_kvfree(m->front.iov_base);
kmem_cache_free(ceph_msg_cache, m);
}
@@ -3301,8 +3290,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
void ceph_msg_dump(struct ceph_msg *msg)
{
- pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
- msg->front_max, msg->data_length);
+ pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+ msg->front_alloc_len, msg->data_length);
print_hex_dump(KERN_DEBUG, "header: ",
DUMP_PREFIX_OFFSET, 16, 1,
&msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 1fe25cd29d0e..2ac9ef35110b 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc)
/* initiatiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
__send_prepared_auth_request(monc, ret);
} else {
dout("open_session mon%d already open\n", monc->cur_mon);
@@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
int num;
p = msg->front.iov_base;
- end = p + msg->front_max;
+ end = p + msg->front_alloc_len;
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
ceph_encode_32(&p, num);
@@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len,
monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
if (ret < 0) {
monc->client->auth_err = ret;
wake_up_all(&monc->client->auth_wq);
@@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc)
return 0;
ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
if (ret <= 0)
return ret; /* either an error, or no need to authenticate */
__send_prepared_auth_request(monc, ret);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2b4b32aaa893..0676f2b199d6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
msg_size = 4 + 4 + 8 + 8 + 4+8;
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pg_t */
- msg_size += 4 + MAX_OBJ_NAME_SIZE;
+ msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
@@ -368,6 +368,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
INIT_LIST_HEAD(&req->r_req_lru_item);
INIT_LIST_HEAD(&req->r_osd_item);
+ req->r_base_oloc.pool = -1;
+ req->r_target_oloc.pool = -1;
+
/* create reply message */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -761,11 +764,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (num_ops > 1)
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
- req->r_file_layout = *layout; /* keep a copy */
+ req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
- snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
- vino.ino, objnum);
- req->r_oid_len = strlen(req->r_oid);
+ snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
+ "%llx.%08llx", vino.ino, objnum);
+ req->r_base_oid.name_len = strlen(req->r_base_oid.name);
return req;
}
@@ -1044,8 +1047,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
!ceph_con_opened(&osd->o_con)) {
struct ceph_osd_request *req;
- dout(" osd addr hasn't changed and connection never opened,"
- " letting msgr retry");
+ dout("osd addr hasn't changed and connection never opened, "
+ "letting msgr retry\n");
/* touch each r_stamp for handle_timeout()'s benfit */
list_for_each_entry(req, &osd->o_requests, r_osd_item)
req->r_stamp = jiffies;
@@ -1232,6 +1235,61 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
+ *
+ * Caller should hold map_sem for read.
+ */
+static bool __req_should_be_paused(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+/*
+ * Calculate mapping of a request to a PG. Takes tiering into account.
+ */
+static int __calc_request_pg(struct ceph_osdmap *osdmap,
+ struct ceph_osd_request *req,
+ struct ceph_pg *pg_out)
+{
+ bool need_check_tiering;
+
+ need_check_tiering = false;
+ if (req->r_target_oloc.pool == -1) {
+ req->r_target_oloc = req->r_base_oloc; /* struct */
+ need_check_tiering = true;
+ }
+ if (req->r_target_oid.name_len == 0) {
+ ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
+ need_check_tiering = true;
+ }
+
+ if (need_check_tiering &&
+ (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
+ if (pi) {
+ if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ pi->read_tier >= 0)
+ req->r_target_oloc.pool = pi->read_tier;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ pi->write_tier >= 0)
+ req->r_target_oloc.pool = pi->write_tier;
+ }
+ /* !pi is caught in ceph_oloc_oid_to_pg() */
+ }
+
+ return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
+ &req->r_target_oid, pg_out);
+}
+
+/*
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
* (as needed), and set the request r_osd appropriately. If there is
* no up osd, set r_osd to NULL. Move the request to the appropriate list
@@ -1248,10 +1306,11 @@ static int __map_request(struct ceph_osd_client *osdc,
int acting[CEPH_PG_MAX_SIZE];
int o = -1, num = 0;
int err;
+ bool was_paused;
dout("map_request %p tid %lld\n", req, req->r_tid);
- err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
- ceph_file_layout_pg_pool(req->r_file_layout));
+
+ err = __calc_request_pg(osdc->osdmap, req, &pgid);
if (err) {
list_move(&req->r_req_lru_item, &osdc->req_notarget);
return err;
@@ -1264,12 +1323,18 @@ static int __map_request(struct ceph_osd_client *osdc,
num = err;
}
+ was_paused = req->r_paused;
+ req->r_paused = __req_should_be_paused(osdc, req);
+ if (was_paused && !req->r_paused)
+ force_resend = 1;
+
if ((!force_resend &&
req->r_osd && req->r_osd->o_osd == o &&
req->r_sent >= req->r_osd->o_incarnation &&
req->r_num_pg_osds == num &&
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1))
+ (req->r_osd == NULL && o == -1) ||
+ req->r_paused)
return 0; /* no change */
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
@@ -1331,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc,
/* fill in message content that changes each time we send it */
put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
put_unaligned_le32(req->r_flags, req->r_request_flags);
- put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
+ put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
p = req->r_request_pgid;
ceph_encode_64(&p, req->r_pgid.pool);
ceph_encode_32(&p, req->r_pgid.seed);
@@ -1362,6 +1427,40 @@ static void __send_queued(struct ceph_osd_client *osdc)
}
/*
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc;
+
+ __register_request(osdc, req);
+ req->r_sent = 0;
+ req->r_got_reply = 0;
+ rc = __map_request(osdc, req, 0);
+ if (rc < 0) {
+ if (nofail) {
+ dout("osdc_start_request failed map, "
+ " will retry %lld\n", req->r_tid);
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ return rc;
+ }
+
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ } else {
+ __send_queued(osdc);
+ }
+
+ return 0;
+}
+
+/*
* Timeout callback, called every N seconds when 1 or more osd
* requests has been active for more than N seconds. When this
* happens, we ping all OSDs with requests who have timed out to
@@ -1432,6 +1531,109 @@ static void handle_osds_timeout(struct work_struct *work)
round_jiffies_relative(delay));
}
+static int ceph_oloc_decode(void **p, void *end,
+ struct ceph_object_locator *oloc)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret = 0;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_v < 3) {
+ pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ if (struct_cv > 6) {
+ pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ oloc->pool = ceph_decode_64(p);
+ *p += 4; /* skip preferred */
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::key is set\n");
+ goto e_inval;
+ }
+
+ if (struct_v >= 5) {
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::nspace is set\n");
+ goto e_inval;
+ }
+ }
+
+ if (struct_v >= 6) {
+ s64 hash = ceph_decode_64(p);
+ if (hash != -1) {
+ pr_warn("ceph_object_locator::hash is set\n");
+ goto e_inval;
+ }
+ }
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_redirect_decode(void **p, void *end,
+ struct ceph_request_redirect *redir)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_cv > 1) {
+ pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ ret = ceph_oloc_decode(p, end, &redir->oloc);
+ if (ret)
+ goto out;
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_request_redirect::object_name is set\n");
+ goto e_inval;
+ }
+
+ len = ceph_decode_32(p);
+ *p += len; /* skip osd_instructions */
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
static void complete_request(struct ceph_osd_request *req)
{
complete_all(&req->r_safe_completion); /* fsync waiter */
@@ -1446,6 +1648,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
{
void *p, *end;
struct ceph_osd_request *req;
+ struct ceph_request_redirect redir;
u64 tid;
int object_len;
unsigned int numops;
@@ -1484,6 +1687,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
osdmap_epoch = ceph_decode_32(&p);
/* lookup */
+ down_read(&osdc->map_sem);
mutex_lock(&osdc->request_mutex);
req = __lookup_request(osdc, tid);
if (req == NULL) {
@@ -1525,10 +1729,40 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
for (i = 0; i < numops; i++)
req->r_reply_op_result[i] = ceph_decode_32(&p);
- already_completed = req->r_got_reply;
+ if (le16_to_cpu(msg->hdr.version) >= 6) {
+ p += 8 + 4; /* skip replay_version */
+ p += 8; /* skip user_version */
- if (!req->r_got_reply) {
+ err = ceph_redirect_decode(&p, end, &redir);
+ if (err)
+ goto bad_put;
+ } else {
+ redir.oloc.pool = -1;
+ }
+
+ if (redir.oloc.pool != -1) {
+ dout("redirect pool %lld\n", redir.oloc.pool);
+
+ __unregister_request(osdc, req);
+ req->r_target_oloc = redir.oloc; /* struct */
+
+ /*
+ * Start redirect requests with nofail=true. If
+ * mapping fails, request will end up on the notarget
+ * list, waiting for the new osdmap (which can take
+ * a while), even though the original request mapped
+ * successfully. In the future we might want to follow
+ * original request's nofail setting here.
+ */
+ err = __ceph_osdc_start_request(osdc, req, true);
+ BUG_ON(err);
+
+ goto out_unlock;
+ }
+
+ already_completed = req->r_got_reply;
+ if (!req->r_got_reply) {
req->r_result = result;
dout("handle_reply result %d bytes %d\n", req->r_result,
bytes);
@@ -1542,8 +1776,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
req->r_got_reply = 1;
} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
dout("handle_reply tid %llu dup ack\n", tid);
- mutex_unlock(&osdc->request_mutex);
- goto done;
+ goto out_unlock;
}
dout("handle_reply tid %llu flags %d\n", tid, flags);
@@ -1558,6 +1791,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
__unregister_request(osdc, req);
mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
if (!already_completed) {
if (req->r_unsafe_callback &&
@@ -1575,15 +1809,27 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
complete_request(req);
}
-done:
+out:
dout("req=%p req->r_linger=%d\n", req, req->r_linger);
ceph_osdc_put_request(req);
return;
+out_unlock:
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ goto out;
bad_put:
+ req->r_result = -EIO;
+ __unregister_request(osdc, req);
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete_all(&req->r_completion);
+ complete_request(req);
ceph_osdc_put_request(req);
bad_mutex:
mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
bad:
pr_err("corrupt osd_op_reply got %d %d\n",
(int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
@@ -1613,14 +1859,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
*
* Caller should hold map_sem for read.
*/
-static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
+static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
+ bool force_resend_writes)
{
struct ceph_osd_request *req, *nreq;
struct rb_node *p;
int needmap = 0;
int err;
+ bool force_resend_req;
- dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
+ dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
+ force_resend_writes ? " (force resend writes)" : "");
mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; ) {
req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -1645,7 +1894,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
continue;
}
- err = __map_request(osdc, req, force_resend);
+ force_resend_req = force_resend ||
+ (force_resend_writes &&
+ req->r_flags & CEPH_OSD_FLAG_WRITE);
+ err = __map_request(osdc, req, force_resend_req);
if (err < 0)
continue; /* error */
if (req->r_osd == NULL) {
@@ -1665,7 +1917,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
r_linger_item) {
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
- err = __map_request(osdc, req, force_resend);
+ err = __map_request(osdc, req,
+ force_resend || force_resend_writes);
dout("__map_request returned %d\n", err);
if (err == 0)
continue; /* no change and no osd was specified */
@@ -1707,6 +1960,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
struct ceph_osdmap *newmap = NULL, *oldmap;
int err;
struct ceph_fsid fsid;
+ bool was_full;
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
p = msg->front.iov_base;
@@ -1720,6 +1974,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
down_write(&osdc->map_sem);
+ was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
dout(" %d inc maps\n", nr_maps);
@@ -1744,7 +2000,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_osdmap_destroy(osdc->osdmap);
osdc->osdmap = newmap;
}
- kick_requests(osdc, 0);
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, 0, was_full);
} else {
dout("ignoring incremental map %u len %d\n",
epoch, maplen);
@@ -1787,7 +2046,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
skipped_map = 1;
ceph_osdmap_destroy(oldmap);
}
- kick_requests(osdc, skipped_map);
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, skipped_map, was_full);
}
p += maplen;
nr_maps--;
@@ -1804,7 +2066,9 @@ done:
* we find out when we are no longer full and stop returning
* ENOSPC.
*/
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
ceph_monc_request_next_osdmap(&osdc->client->monc);
mutex_lock(&osdc->request_mutex);
@@ -2068,10 +2332,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
ceph_encode_32(&p, -1); /* preferred */
/* oid */
- ceph_encode_32(&p, req->r_oid_len);
- memcpy(p, req->r_oid, req->r_oid_len);
- dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
- p += req->r_oid_len;
+ ceph_encode_32(&p, req->r_base_oid.name_len);
+ memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
+ dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
+ req->r_base_oid.name, req->r_base_oid.name_len);
+ p += req->r_base_oid.name_len;
/* ops--can imply data */
ceph_encode_16(&p, (u16)req->r_num_ops);
@@ -2125,34 +2390,16 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
bool nofail)
{
- int rc = 0;
+ int rc;
down_read(&osdc->map_sem);
mutex_lock(&osdc->request_mutex);
- __register_request(osdc, req);
- req->r_sent = 0;
- req->r_got_reply = 0;
- rc = __map_request(osdc, req, 0);
- if (rc < 0) {
- if (nofail) {
- dout("osdc_start_request failed map, "
- " will retry %lld\n", req->r_tid);
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- goto out_unlock;
- }
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
- } else {
- __send_queued(osdc);
- }
- rc = 0;
-out_unlock:
+
+ rc = __ceph_osdc_start_request(osdc, req, nofail);
+
mutex_unlock(&osdc->request_mutex);
up_read(&osdc->map_sem);
+
return rc;
}
EXPORT_SYMBOL(ceph_osdc_start_request);
@@ -2278,9 +2525,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
err = -ENOMEM;
osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
if (!osdc->notify_wq)
- goto out_msgpool;
+ goto out_msgpool_reply;
+
return 0;
+out_msgpool_reply:
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
out_msgpool:
ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
@@ -2454,7 +2704,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_osd_client *osdc = osd->o_osdc;
struct ceph_msg *m;
struct ceph_osd_request *req;
- int front = le32_to_cpu(hdr->front_len);
+ int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
u64 tid;
@@ -2474,12 +2724,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply, req->r_reply->con);
ceph_msg_revoke_incoming(req->r_reply);
- if (front > req->r_reply->front.iov_len) {
+ if (front_len > req->r_reply->front_alloc_len) {
pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n",
- front, (int)req->r_reply->front.iov_len,
+ front_len, req->r_reply->front_alloc_len,
(unsigned int)con->peer_name.type,
le64_to_cpu(con->peer_name.num));
- m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
+ false);
if (!m)
goto out;
ceph_msg_put(req->r_reply);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index dbd9a4792427..aade4a5c1c07 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
return NULL;
}
+struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
+{
+ return __lookup_pg_pool(&map->pg_pools, id);
+}
+
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
struct ceph_pg_pool_info *pi;
@@ -514,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
- if (cv > 7) {
- pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
+ if (cv > 9) {
+ pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
len = ceph_decode_32(p);
@@ -543,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += len;
}
- /* skip removed snaps */
+ /* skip removed_snaps */
num = ceph_decode_32(p);
*p += num * (8 + 8);
*p += 8; /* skip auid */
pi->flags = ceph_decode_64(p);
+ *p += 4; /* skip crash_replay_interval */
+
+ if (ev >= 7)
+ *p += 1; /* skip min_size */
+
+ if (ev >= 8)
+ *p += 8 + 8; /* skip quota_max_* */
+
+ if (ev >= 9) {
+ /* skip tiers */
+ num = ceph_decode_32(p);
+ *p += num * 8;
+
+ *p += 8; /* skip tier_of */
+ *p += 1; /* skip cache_mode */
+
+ pi->read_tier = ceph_decode_64(p);
+ pi->write_tier = ceph_decode_64(p);
+ } else {
+ pi->read_tier = -1;
+ pi->write_tier = -1;
+ }
/* ignore the rest */
@@ -1090,25 +1117,40 @@ invalid:
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
+ * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
+ * called with target's (oloc, oid), since tiering isn't taken into
+ * account.
*/
-int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
- struct ceph_osdmap *osdmap, uint64_t pool)
+int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_locator *oloc,
+ struct ceph_object_id *oid,
+ struct ceph_pg *pg_out)
{
- struct ceph_pg_pool_info *pool_info;
+ struct ceph_pg_pool_info *pi;
- BUG_ON(!osdmap);
- pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
- if (!pool_info)
+ pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ if (!pi)
return -EIO;
- pg->pool = pool;
- pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
- dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
+ pg_out->pool = oloc->pool;
+ pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+
+ dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
+ pg_out->pool, pg_out->seed);
return 0;
}
-EXPORT_SYMBOL(ceph_calc_ceph_pg);
+EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+
+static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
+ int *result, int result_max,
+ const __u32 *weight, int weight_max)
+{
+ int scratch[result_max * 3];
+
+ return crush_do_rule(map, ruleno, x, result, result_max,
+ weight, weight_max, scratch);
+}
/*
* Calculate raw osd vector for the given pgid. Return pointer to osd
@@ -1163,9 +1205,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
pool->pgp_num_mask) +
(unsigned)pgid.pool;
}
- r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
- min_t(int, pool->size, *num),
- osdmap->osd_weight);
+ r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
+ osds, min_t(int, pool->size, *num),
+ osdmap->osd_weight, osdmap->max_osd);
if (r < 0) {
pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
" size %d\n", r, pgid.pool, pool->crush_ruleset,