diff options
Diffstat (limited to 'net/ceph/osdmap.c')
| -rw-r--r-- | net/ceph/osdmap.c | 273 |
1 files changed, 224 insertions, 49 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 96c25f5e064a..d245fa508e1c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -11,6 +11,22 @@ #include <linux/crush/hash.h> #include <linux/crush/mapper.h> +static __printf(2, 3) +void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, + map->epoch, &vaf); + + va_end(args); +} + char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) @@ -571,10 +587,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) goto bad; #endif r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); - c->rules[i] = r; if (r == NULL) goto badmem; dout(" rule %d is at %p\n", i, r); + c->rules[i] = r; r->len = yes; ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); @@ -965,6 +981,143 @@ bad: } /* + * CRUSH workspaces + * + * workspace_manager framework borrowed from fs/btrfs/compression.c. + * Two simplifications: there is only one type of workspace and there + * is always at least one workspace. + */ +static struct crush_work *alloc_workspace(const struct crush_map *c) +{ + struct crush_work *work; + size_t work_size; + + WARN_ON(!c->working_size); + work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + + work = kvmalloc(work_size, GFP_NOIO); + if (!work) + return NULL; + + INIT_LIST_HEAD(&work->item); + crush_init_workspace(c, work); + return work; +} + +static void free_workspace(struct crush_work *work) +{ + WARN_ON(!list_empty(&work->item)); + kvfree(work); +} + +static void init_workspace_manager(struct workspace_manager *wsm) +{ + INIT_LIST_HEAD(&wsm->idle_ws); + spin_lock_init(&wsm->ws_lock); + atomic_set(&wsm->total_ws, 0); + wsm->free_ws = 0; + init_waitqueue_head(&wsm->ws_wait); +} + +static void add_initial_workspace(struct workspace_manager *wsm, + struct crush_work *work) +{ + WARN_ON(!list_empty(&wsm->idle_ws)); + + list_add(&work->item, &wsm->idle_ws); + atomic_set(&wsm->total_ws, 1); + wsm->free_ws = 1; +} + +static void cleanup_workspace_manager(struct workspace_manager *wsm) +{ + struct crush_work *work; + + while (!list_empty(&wsm->idle_ws)) { + work = list_first_entry(&wsm->idle_ws, struct crush_work, + item); + list_del_init(&work->item); + free_workspace(work); + } + atomic_set(&wsm->total_ws, 0); + wsm->free_ws = 0; +} + +/* + * Finds an available workspace or allocates a new one. If it's not + * possible to allocate a new one, waits until there is one. + */ +static struct crush_work *get_workspace(struct workspace_manager *wsm, + const struct crush_map *c) +{ + struct crush_work *work; + int cpus = num_online_cpus(); + +again: + spin_lock(&wsm->ws_lock); + if (!list_empty(&wsm->idle_ws)) { + work = list_first_entry(&wsm->idle_ws, struct crush_work, + item); + list_del_init(&work->item); + wsm->free_ws--; + spin_unlock(&wsm->ws_lock); + return work; + + } + if (atomic_read(&wsm->total_ws) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(&wsm->ws_lock); + prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws) + schedule(); + finish_wait(&wsm->ws_wait, &wait); + goto again; + } + atomic_inc(&wsm->total_ws); + spin_unlock(&wsm->ws_lock); + + work = alloc_workspace(c); + if (!work) { + atomic_dec(&wsm->total_ws); + wake_up(&wsm->ws_wait); + + /* + * Do not return the error but go back to waiting. We + * have the initial workspace and the CRUSH computation + * time is bounded so we will get it eventually. + */ + WARN_ON(atomic_read(&wsm->total_ws) < 1); + goto again; + } + return work; +} + +/* + * Puts a workspace back on the list or frees it if we have enough + * idle ones sitting around. + */ +static void put_workspace(struct workspace_manager *wsm, + struct crush_work *work) +{ + spin_lock(&wsm->ws_lock); + if (wsm->free_ws <= num_online_cpus()) { + list_add(&work->item, &wsm->idle_ws); + wsm->free_ws++; + spin_unlock(&wsm->ws_lock); + goto wake; + } + spin_unlock(&wsm->ws_lock); + + free_workspace(work); + atomic_dec(&wsm->total_ws); +wake: + if (wq_has_sleeper(&wsm->ws_wait)) + wake_up(&wsm->ws_wait); +} + +/* * osd map */ struct ceph_osdmap *ceph_osdmap_alloc(void) @@ -981,7 +1134,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->primary_temp = RB_ROOT; map->pg_upmap = RB_ROOT; map->pg_upmap_items = RB_ROOT; - mutex_init(&map->crush_workspace_mutex); + + init_workspace_manager(&map->crush_wsm); return map; } @@ -989,8 +1143,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); + if (map->crush) crush_destroy(map->crush); + cleanup_workspace_manager(&map->crush_wsm); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), @@ -1029,7 +1186,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kvfree(map->osd_weight); kvfree(map->osd_addr); kvfree(map->osd_primary_affinity); - kvfree(map->crush_workspace); kfree(map); } @@ -1050,9 +1206,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (max == map->max_osd) return 0; - state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); - weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); - addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); @@ -1082,7 +1238,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; @@ -1104,26 +1260,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { - void *workspace; - size_t work_size; + struct crush_work *work; if (IS_ERR(crush)) return PTR_ERR(crush); - work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); - dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = ceph_kvmalloc(work_size, GFP_NOIO); - if (!workspace) { + work = alloc_workspace(crush); + if (!work) { crush_destroy(crush); return -ENOMEM; } - crush_init_workspace(crush, workspace); if (map->crush) crush_destroy(map->crush); - kvfree(map->crush_workspace); + cleanup_workspace_manager(&map->crush_wsm); map->crush = crush; - map->crush_workspace = workspace; + add_initial_workspace(&map->crush_wsm, work); return 0; } @@ -1173,7 +1325,7 @@ static int get_osdmap_client_data_v(void **p, void *end, return -EINVAL; } - /* old osdmap enconding */ + /* old osdmap encoding */ struct_v = 0; } @@ -1352,8 +1504,6 @@ static int decode_new_primary_temp(void **p, void *end, u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1362,12 +1512,10 @@ u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = ceph_kvmalloc( + map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) @@ -1425,12 +1573,14 @@ static int decode_new_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, aff, e_inval); + if (osd >= map->max_osd) + goto e_inval; ret = set_primary_affinity(map, osd, aff); if (ret) return ret; - pr_info("osd%d primary-affinity 0x%x\n", osd, aff); + osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); } return 0; @@ -1511,7 +1661,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1582,9 +1733,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1654,7 +1812,7 @@ bad: /* * Allocate and decode a full map. */ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1663,7 +1821,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1681,12 +1839,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1695,8 +1854,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1714,10 +1877,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, ceph_decode_need(p, end, 2*sizeof(u32), e_inval); osd = ceph_decode_32(p); w = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); - pr_info("osd%d weight 0x%x %s\n", osd, w, - w == CEPH_OSD_IN ? "(in)" : - (w == CEPH_OSD_OUT ? "(out)" : "")); + if (osd >= map->max_osd) + goto e_inval; + + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, + w == CEPH_OSD_IN ? "(in)" : + (w == CEPH_OSD_OUT ? "(out)" : "")); map->osd_weight[osd] = w; /* @@ -1738,22 +1903,23 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); + if (osd >= map->max_osd) + goto e_inval; + if (struct_v >= 5) xorstate = ceph_decode_32(p); else xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; - BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) - pr_info("osd%d down\n", osd); + osdmap_info(map, "osd%d down\n", osd); if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && (xorstate & CEPH_OSD_EXISTS)) { - pr_info("osd%d does not exist\n", osd); + osdmap_info(map, "osd%d does not exist\n", osd); ret = set_primary_affinity(map, osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); if (ret) @@ -1773,10 +1939,19 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; osd = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) + if (osd >= map->max_osd) goto e_inval; - pr_info("osd%d up\n", osd); + + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + + osdmap_info(map, "osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; } @@ -1791,7 +1966,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1826,7 +2001,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -1878,7 +2053,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; @@ -2322,6 +2497,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, s64 choose_args_index) { struct crush_choose_arg_map *arg_map; + struct crush_work *work; int r; BUG_ON(result_max > CEPH_PG_MAX_SIZE); @@ -2332,12 +2508,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, arg_map = lookup_choose_arg_map(&map->crush->choose_args, CEPH_DEFAULT_CHOOSE_ARGS); - mutex_lock(&map->crush_workspace_mutex); + work = get_workspace(&map->crush_wsm, map->crush); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_workspace, + weight, weight_max, work, arg_map ? arg_map->args : NULL); - mutex_unlock(&map->crush_workspace_mutex); - + put_workspace(&map->crush_wsm, work); return r; } @@ -2855,7 +3030,7 @@ static bool is_valid_crush_name(const char *name) * parent, returns 0. * * Does a linear search, as there are no parent pointers of any - * kind. Note that the result is ambigous for items that occur + * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, |
