diff options
Diffstat (limited to 'drivers/block/rbd.c')
| -rw-r--r-- | drivers/block/rbd.c | 807 |
1 files changed, 454 insertions, 353 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 04453f4a319c..af0e21149dbc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -362,7 +362,7 @@ enum rbd_watch_state { enum rbd_lock_state { RBD_LOCK_STATE_UNLOCKED, RBD_LOCK_STATE_LOCKED, - RBD_LOCK_STATE_RELEASING, + RBD_LOCK_STATE_QUIESCING, }; /* WatchNotify::ClientId */ @@ -422,7 +422,7 @@ struct rbd_device { struct list_head running_list; struct completion acquire_wait; int acquire_err; - struct completion releasing_wait; + struct completion quiescing_wait; spinlock_t object_map_lock; u8 *object_map; @@ -491,12 +491,12 @@ static bool single_major = true; module_param(single_major, bool, 0444); MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); -static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count); -static ssize_t remove_store(struct bus_type *bus, const char *buf, +static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count); +static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count); -static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf, size_t count); -static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); @@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) lockdep_assert_held(&rbd_dev->lock_rwsem); return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || - rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; + rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING; } static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) @@ -538,7 +538,7 @@ static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) return is_lock_owner; } -static ssize_t supported_features_show(struct bus_type *bus, char *buf) +static ssize_t supported_features_show(const struct bus_type *bus, char *buf) { return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); } @@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = { }; __ATTRIBUTE_GROUPS(rbd_bus); -static struct bus_type rbd_bus_type = { +static const struct bus_type rbd_bus_type = { .name = "rbd", .bus_groups = rbd_bus_groups, }; @@ -632,9 +632,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); -static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); -static int rbd_dev_header_info(struct rbd_device *rbd_dev); -static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, + struct rbd_image_header *header); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -660,9 +659,9 @@ static bool pending_result_dec(struct pending_result *pending, int *result) return true; } -static int rbd_open(struct block_device *bdev, fmode_t mode) +static int rbd_open(struct gendisk *disk, blk_mode_t mode) { - struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + struct rbd_device *rbd_dev = disk->private_data; bool removing = false; spin_lock_irq(&rbd_dev->lock); @@ -679,7 +678,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) return 0; } -static void rbd_release(struct gendisk *disk, fmode_t mode) +static void rbd_release(struct gendisk *disk) { struct rbd_device *rbd_dev = disk->private_data; unsigned long open_count_before; @@ -995,15 +994,24 @@ static void rbd_init_layout(struct rbd_device *rbd_dev) RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); } +static void rbd_image_header_cleanup(struct rbd_image_header *header) +{ + kfree(header->object_prefix); + ceph_put_snap_context(header->snapc); + kfree(header->snap_sizes); + kfree(header->snap_names); + + memset(header, 0, sizeof(*header)); +} + /* * Fill an rbd image header with information from the given format 1 * on-disk header. */ -static int rbd_header_from_disk(struct rbd_device *rbd_dev, - struct rbd_image_header_ondisk *ondisk) +static int rbd_header_from_disk(struct rbd_image_header *header, + struct rbd_image_header_ondisk *ondisk, + bool first_time) { - struct rbd_image_header *header = &rbd_dev->header; - bool first_time = header->object_prefix == NULL; struct ceph_snap_context *snapc; char *object_prefix = NULL; char *snap_names = NULL; @@ -1070,11 +1078,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; - rbd_init_layout(rbd_dev); - } else { - ceph_put_snap_context(header->snapc); - kfree(header->snap_names); - kfree(header->snap_sizes); } /* The remaining fields always get updated (when we refresh) */ @@ -1334,14 +1337,30 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) /* * Must be called after rbd_obj_calc_img_extents(). */ -static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) +static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req) { - if (!obj_req->num_img_extents || - (rbd_obj_is_entire(obj_req) && - !obj_req->img_request->snapc->num_snaps)) - return false; + rbd_assert(obj_req->img_request->snapc); - return true; + if (obj_req->img_request->op_type == OBJ_OP_DISCARD) { + dout("%s %p objno %llu discard\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } + + if (!obj_req->num_img_extents) { + dout("%s %p objno %llu not overlapping\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } + + if (rbd_obj_is_entire(obj_req) && + !obj_req->img_request->snapc->num_snaps) { + dout("%s %p objno %llu entire\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } + + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; } static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) @@ -1442,6 +1461,7 @@ __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, static struct ceph_osd_request * rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops) { + rbd_assert(obj_req->img_request->snapc); return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc, num_ops); } @@ -1578,15 +1598,18 @@ static void rbd_img_request_init(struct rbd_img_request *img_request, mutex_init(&img_request->state_mutex); } +/* + * Only snap_id is captured here, for reads. For writes, snapshot + * context is captured in rbd_img_object_requests() after exclusive + * lock is ensured to be held. + */ static void rbd_img_capture_header(struct rbd_img_request *img_req) { struct rbd_device *rbd_dev = img_req->rbd_dev; lockdep_assert_held(&rbd_dev->header_rwsem); - if (rbd_img_is_write(img_req)) - img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc); - else + if (!rbd_img_is_write(img_req)) img_req->snap_id = rbd_dev->spec->snap_id; if (rbd_dev_parent_get(rbd_dev)) @@ -2233,9 +2256,6 @@ static int rbd_obj_init_write(struct rbd_obj_request *obj_req) if (ret) return ret; - if (rbd_obj_copyup_enabled(obj_req)) - obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; - obj_req->write_state = RBD_OBJ_WRITE_START; return 0; } @@ -2341,8 +2361,6 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) if (ret) return ret; - if (rbd_obj_copyup_enabled(obj_req)) - obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; if (!obj_req->num_img_extents) { obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; if (rbd_obj_is_entire(obj_req)) @@ -3068,13 +3086,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) for (i = 0; i < obj_req->copyup_bvec_count; i++) { unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); + struct page *page = alloc_page(GFP_NOIO); - obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (!obj_req->copyup_bvecs[i].bv_page) + if (!page) return -ENOMEM; - obj_req->copyup_bvecs[i].bv_offset = 0; - obj_req->copyup_bvecs[i].bv_len = len; + bvec_set_page(&obj_req->copyup_bvecs[i], page, len, 0); obj_overlap -= len; } @@ -3287,6 +3304,7 @@ again: case RBD_OBJ_WRITE_START: rbd_assert(!*result); + rbd_obj_set_copyup_enabled(obj_req); if (rbd_obj_write_is_noop(obj_req)) return true; @@ -3434,17 +3452,19 @@ static bool rbd_lock_add_request(struct rbd_img_request *img_req) static void rbd_lock_del_request(struct rbd_img_request *img_req) { struct rbd_device *rbd_dev = img_req->rbd_dev; - bool need_wakeup; + bool need_wakeup = false; lockdep_assert_held(&rbd_dev->lock_rwsem); spin_lock(&rbd_dev->lock_lists_lock); - rbd_assert(!list_empty(&img_req->lock_item)); - list_del_init(&img_req->lock_item); - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && - list_empty(&rbd_dev->running_list)); + if (!list_empty(&img_req->lock_item)) { + rbd_assert(!list_empty(&rbd_dev->running_list)); + list_del_init(&img_req->lock_item); + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && + list_empty(&rbd_dev->running_list)); + } spin_unlock(&rbd_dev->lock_lists_lock); if (need_wakeup) - complete(&rbd_dev->releasing_wait); + complete(&rbd_dev->quiescing_wait); } static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) @@ -3457,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) if (rbd_lock_add_request(img_req)) return 1; - if (rbd_dev->opts->exclusive) { - WARN_ON(1); /* lock got released? */ - return -EROFS; - } - /* * Note the use of mod_delayed_work() in rbd_acquire_lock() * and cancel_delayed_work() in wake_lock_waiters(). @@ -3473,9 +3488,19 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) static void rbd_img_object_requests(struct rbd_img_request *img_req) { + struct rbd_device *rbd_dev = img_req->rbd_dev; struct rbd_obj_request *obj_req; rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); + rbd_assert(!need_exclusive_lock(img_req) || + __rbd_is_lock_owner(rbd_dev)); + + if (rbd_img_is_write(img_req)) { + rbd_assert(!img_req->snapc); + down_read(&rbd_dev->header_rwsem); + img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); + } for_each_obj_request(img_req, obj_req) { int result = 0; @@ -3493,7 +3518,6 @@ static void rbd_img_object_requests(struct rbd_img_request *img_req) static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) { - struct rbd_device *rbd_dev = img_req->rbd_dev; int ret; again: @@ -3514,9 +3538,6 @@ again: if (*result) return true; - rbd_assert(!need_exclusive_lock(img_req) || - __rbd_is_lock_owner(rbd_dev)); - rbd_img_object_requests(img_req); if (!img_req->pending.num_pending) { *result = img_req->pending.result; @@ -3654,7 +3675,7 @@ static int rbd_lock(struct rbd_device *rbd_dev) ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, RBD_LOCK_TAG, "", 0); - if (ret) + if (ret && ret != -EEXIST) return ret; __rbd_lock(rbd_dev, cookie); @@ -3818,61 +3839,97 @@ static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) return; } - list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { + while (!list_empty(&rbd_dev->acquiring_list)) { + img_req = list_first_entry(&rbd_dev->acquiring_list, + struct rbd_img_request, lock_item); mutex_lock(&img_req->state_mutex); rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); + if (!result) + list_move_tail(&img_req->lock_item, + &rbd_dev->running_list); + else + list_del_init(&img_req->lock_item); rbd_img_schedule(img_req, result); mutex_unlock(&img_req->state_mutex); } +} + +static bool locker_equal(const struct ceph_locker *lhs, + const struct ceph_locker *rhs) +{ + return lhs->id.name.type == rhs->id.name.type && + lhs->id.name.num == rhs->id.name.num && + !strcmp(lhs->id.cookie, rhs->id.cookie) && + ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr); +} - list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); +static void free_locker(struct ceph_locker *locker) +{ + if (locker) + ceph_free_lockers(locker, 1); } -static int get_lock_owner_info(struct rbd_device *rbd_dev, - struct ceph_locker **lockers, u32 *num_lockers) +static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_locker *lockers; + u32 num_lockers; u8 lock_type; char *lock_tag; + u64 handle; int ret; - dout("%s rbd_dev %p\n", __func__, rbd_dev); - ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, - &lock_type, &lock_tag, lockers, num_lockers); - if (ret) - return ret; + &lock_type, &lock_tag, &lockers, &num_lockers); + if (ret) { + rbd_warn(rbd_dev, "failed to get header lockers: %d", ret); + return ERR_PTR(ret); + } - if (*num_lockers == 0) { + if (num_lockers == 0) { dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); + lockers = NULL; goto out; } if (strcmp(lock_tag, RBD_LOCK_TAG)) { rbd_warn(rbd_dev, "locked by external mechanism, tag %s", lock_tag); - ret = -EBUSY; - goto out; + goto err_busy; } - if (lock_type == CEPH_CLS_LOCK_SHARED) { - rbd_warn(rbd_dev, "shared lock type detected"); - ret = -EBUSY; - goto out; + if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) { + rbd_warn(rbd_dev, "incompatible lock type detected"); + goto err_busy; } - if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, - strlen(RBD_LOCK_COOKIE_PREFIX))) { + WARN_ON(num_lockers != 1); + ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", + &handle); + if (ret != 1) { rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", - (*lockers)[0].id.cookie); - ret = -EBUSY; - goto out; + lockers[0].id.cookie); + goto err_busy; + } + if (ceph_addr_is_blank(&lockers[0].info.addr)) { + rbd_warn(rbd_dev, "locker has a blank address"); + goto err_busy; } + dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n", + __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name), + &lockers[0].info.addr.in_addr, + le32_to_cpu(lockers[0].info.addr.nonce), handle); + out: kfree(lock_tag); - return ret; + return lockers; + +err_busy: + kfree(lock_tag); + ceph_free_lockers(lockers, num_lockers); + return ERR_PTR(-EBUSY); } static int find_watcher(struct rbd_device *rbd_dev, @@ -3888,8 +3945,10 @@ static int find_watcher(struct rbd_device *rbd_dev, ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, &watchers, &num_watchers); - if (ret) + if (ret) { + rbd_warn(rbd_dev, "failed to get watchers: %d", ret); return ret; + } sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); for (i = 0; i < num_watchers; i++) { @@ -3926,51 +3985,72 @@ out: static int rbd_try_lock(struct rbd_device *rbd_dev) { struct ceph_client *client = rbd_dev->rbd_client->client; - struct ceph_locker *lockers; - u32 num_lockers; + struct ceph_locker *locker, *refreshed_locker; int ret; for (;;) { + locker = refreshed_locker = NULL; + ret = rbd_lock(rbd_dev); - if (ret != -EBUSY) - return ret; + if (!ret) + goto out; + if (ret != -EBUSY) { + rbd_warn(rbd_dev, "failed to lock header: %d", ret); + goto out; + } /* determine if the current lock holder is still alive */ - ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); - if (ret) - return ret; - - if (num_lockers == 0) + locker = get_lock_owner_info(rbd_dev); + if (IS_ERR(locker)) { + ret = PTR_ERR(locker); + locker = NULL; + goto out; + } + if (!locker) goto again; - ret = find_watcher(rbd_dev, lockers); + ret = find_watcher(rbd_dev, locker); if (ret) goto out; /* request lock or error */ + refreshed_locker = get_lock_owner_info(rbd_dev); + if (IS_ERR(refreshed_locker)) { + ret = PTR_ERR(refreshed_locker); + refreshed_locker = NULL; + goto out; + } + if (!refreshed_locker || + !locker_equal(locker, refreshed_locker)) + goto again; + rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", - ENTITY_NAME(lockers[0].id.name)); + ENTITY_NAME(locker->id.name)); ret = ceph_monc_blocklist_add(&client->monc, - &lockers[0].info.addr); + &locker->info.addr); if (ret) { - rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d", - ENTITY_NAME(lockers[0].id.name), ret); + rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d", + ENTITY_NAME(locker->id.name), ret); goto out; } ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, - lockers[0].id.cookie, - &lockers[0].id.name); - if (ret && ret != -ENOENT) + locker->id.cookie, &locker->id.name); + if (ret && ret != -ENOENT) { + rbd_warn(rbd_dev, "failed to break header lock: %d", + ret); goto out; + } again: - ceph_free_lockers(lockers, num_lockers); + free_locker(refreshed_locker); + free_locker(locker); } out: - ceph_free_lockers(lockers, num_lockers); + free_locker(refreshed_locker); + free_locker(locker); return ret; } @@ -3978,6 +4058,10 @@ static int rbd_post_acquire_action(struct rbd_device *rbd_dev) { int ret; + ret = rbd_dev_refresh(rbd_dev); + if (ret) + return ret; + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { ret = rbd_object_map_open(rbd_dev); if (ret) @@ -4016,11 +4100,8 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) ret = rbd_try_lock(rbd_dev); if (ret < 0) { - rbd_warn(rbd_dev, "failed to lock header: %d", ret); - if (ret == -EBLOCKLISTED) - goto out; - - ret = 1; /* request lock anyway */ + rbd_warn(rbd_dev, "failed to acquire lock: %d", ret); + goto out; } if (ret > 0) { up_write(&rbd_dev->lock_rwsem); @@ -4096,16 +4177,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) /* * Ensure that all in-flight IO is flushed. */ - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; - rbd_assert(!completion_done(&rbd_dev->releasing_wait)); + rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING; + rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); if (list_empty(&rbd_dev->running_list)) return true; up_write(&rbd_dev->lock_rwsem); - wait_for_completion(&rbd_dev->releasing_wait); + wait_for_completion(&rbd_dev->quiescing_wait); down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) + if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING) return false; rbd_assert(list_empty(&rbd_dev->running_list)); @@ -4516,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "failed to update lock cookie: %d", ret); + if (rbd_dev->opts->exclusive) + rbd_warn(rbd_dev, + "temporarily releasing lock on exclusive mapping"); + /* * Lock cookie cannot be updated on older OSDs, so do * a manual release and queue an acquire. @@ -4783,7 +4868,9 @@ out_req: * return, the rbd_dev->header field will contain up-to-date * information about the image. */ -static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -4831,7 +4918,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) snap_count = le32_to_cpu(ondisk->snap_count); } while (snap_count != want_count); - ret = rbd_header_from_disk(rbd_dev, ondisk); + ret = rbd_header_from_disk(header, ondisk, first_time); out: kfree(ondisk); @@ -4855,39 +4942,6 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev) } } -static int rbd_dev_refresh(struct rbd_device *rbd_dev) -{ - u64 mapping_size; - int ret; - - down_write(&rbd_dev->header_rwsem); - mapping_size = rbd_dev->mapping.size; - - ret = rbd_dev_header_info(rbd_dev); - if (ret) - goto out; - - /* - * If there is a parent, see if it has disappeared due to the - * mapped image getting flattened. - */ - if (rbd_dev->parent) { - ret = rbd_dev_v2_parent_info(rbd_dev); - if (ret) - goto out; - } - - rbd_assert(!rbd_is_snap(rbd_dev)); - rbd_dev->mapping.size = rbd_dev->header.image_size; - -out: - up_write(&rbd_dev->header_rwsem); - if (!ret && mapping_size != rbd_dev->mapping.size) - rbd_dev_update_size(rbd_dev); - - return ret; -} - static const struct blk_mq_ops rbd_mq_ops = { .queue_rq = rbd_queue_rq, }; @@ -4895,16 +4949,21 @@ static const struct blk_mq_ops rbd_mq_ops = { static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; - struct request_queue *q; unsigned int objset_bytes = rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; + struct queue_limits lim = { + .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, + .io_opt = objset_bytes, + .io_min = rbd_dev->opts->alloc_size, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + }; int err; memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; rbd_dev->tag_set.nr_hw_queues = num_present_cpus(); rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request); @@ -4912,12 +4971,20 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (err) return err; - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + if (rbd_dev->opts->trim) { + lim.discard_granularity = rbd_dev->opts->alloc_size; + lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT; + lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; + } + + if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) + lim.features |= BLK_FEAT_STABLE_WRITES; + + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; } - q = disk->queue; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); @@ -4929,26 +4996,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->minors = RBD_MINORS_PER_MAJOR; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ - - blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); - q->limits.max_sectors = queue_max_hw_sectors(q); - blk_queue_max_segments(q, USHRT_MAX); - blk_queue_max_segment_size(q, UINT_MAX); - blk_queue_io_min(q, rbd_dev->opts->alloc_size); - blk_queue_io_opt(q, rbd_dev->opts->alloc_size); - - if (rbd_dev->opts->trim) { - q->limits.discard_granularity = rbd_dev->opts->alloc_size; - blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); - blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); - } - - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - rbd_dev->disk = disk; return 0; @@ -5278,7 +5325,7 @@ static void rbd_dev_release(struct device *dev) if (need_put) { destroy_workqueue(rbd_dev->task_wq); - ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + ida_free(&rbd_dev_id_ida, rbd_dev->dev_id); } rbd_dev_free(rbd_dev); @@ -5292,8 +5339,7 @@ static void rbd_dev_release(struct device *dev) module_put(THIS_MODULE); } -static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, - struct rbd_spec *spec) +static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec) { struct rbd_device *rbd_dev; @@ -5329,7 +5375,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->acquiring_list); INIT_LIST_HEAD(&rbd_dev->running_list); init_completion(&rbd_dev->acquire_wait); - init_completion(&rbd_dev->releasing_wait); + init_completion(&rbd_dev->quiescing_wait); spin_lock_init(&rbd_dev->object_map_lock); @@ -5338,9 +5384,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, rbd_dev->dev.parent = &rbd_root_dev; device_initialize(&rbd_dev->dev); - rbd_dev->rbd_client = rbdc; - rbd_dev->spec = spec; - return rbd_dev; } @@ -5353,16 +5396,14 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, { struct rbd_device *rbd_dev; - rbd_dev = __rbd_dev_create(rbdc, spec); + rbd_dev = __rbd_dev_create(spec); if (!rbd_dev) return NULL; - rbd_dev->opts = opts; - /* get an id and fill in device name */ - rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, - minor_to_rbd_dev_id(1 << MINORBITS), - GFP_KERNEL); + rbd_dev->dev_id = ida_alloc_max(&rbd_dev_id_ida, + minor_to_rbd_dev_id(1 << MINORBITS) - 1, + GFP_KERNEL); if (rbd_dev->dev_id < 0) goto fail_rbd_dev; @@ -5375,11 +5416,15 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, /* we have a ref from do_rbd_add() */ __module_get(THIS_MODULE); + rbd_dev->rbd_client = rbdc; + rbd_dev->spec = spec; + rbd_dev->opts = opts; + dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); return rbd_dev; fail_dev_id: - ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + ida_free(&rbd_dev_id_ida, rbd_dev->dev_id); fail_rbd_dev: rbd_dev_free(rbd_dev); return NULL; @@ -5429,17 +5474,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, return 0; } -static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) -{ - return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, - &rbd_dev->header.obj_order, - &rbd_dev->header.image_size); -} - -static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) +static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev, + char **pobject_prefix) { size_t size; void *reply_buf; + char *object_prefix; int ret; void *p; @@ -5457,16 +5497,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) goto out; p = reply_buf; - rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, - p + ret, NULL, GFP_NOIO); + object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL, + GFP_NOIO); + if (IS_ERR(object_prefix)) { + ret = PTR_ERR(object_prefix); + goto out; + } ret = 0; - if (IS_ERR(rbd_dev->header.object_prefix)) { - ret = PTR_ERR(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - } else { - dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); - } + *pobject_prefix = object_prefix; + dout(" object_prefix = %s\n", object_prefix); out: kfree(reply_buf); @@ -5517,13 +5557,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, return 0; } -static int rbd_dev_v2_features(struct rbd_device *rbd_dev) -{ - return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, - rbd_is_ro(rbd_dev), - &rbd_dev->header.features); -} - /* * These are generic image flags, but since they are used only for * object map, store them in rbd_dev->object_map_flags. @@ -5560,6 +5593,14 @@ struct parent_image_info { u64 overlap; }; +static void rbd_parent_info_cleanup(struct parent_image_info *pii) +{ + kfree(pii->pool_ns); + kfree(pii->image_id); + + memset(pii, 0, sizeof(*pii)); +} + /* * The caller is responsible for @pii. */ @@ -5629,6 +5670,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev, if (pii->has_overlap) ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", + __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, + pii->has_overlap, pii->overlap); return 0; e_inval: @@ -5667,14 +5711,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev, pii->has_overlap = true; ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", + __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, + pii->has_overlap, pii->overlap); return 0; e_inval: return -EINVAL; } -static int get_parent_info(struct rbd_device *rbd_dev, - struct parent_image_info *pii) +static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev, + struct parent_image_info *pii) { struct page *req_page, *reply_page; void *p; @@ -5702,7 +5749,7 @@ static int get_parent_info(struct rbd_device *rbd_dev, return ret; } -static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) +static int rbd_dev_setup_parent(struct rbd_device *rbd_dev) { struct rbd_spec *parent_spec; struct parent_image_info pii = { 0 }; @@ -5712,37 +5759,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (!parent_spec) return -ENOMEM; - ret = get_parent_info(rbd_dev, &pii); + ret = rbd_dev_v2_parent_info(rbd_dev, &pii); if (ret) goto out_err; - dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", - __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, - pii.has_overlap, pii.overlap); - - if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { - /* - * Either the parent never existed, or we have - * record of it but the image got flattened so it no - * longer has a parent. When the parent of a - * layered image disappears we immediately set the - * overlap to 0. The effect of this is that all new - * requests will be treated as if the image had no - * parent. - * - * If !pii.has_overlap, the parent image spec is not - * applicable. It's there to avoid duplication in each - * snapshot record. - */ - if (rbd_dev->parent_overlap) { - rbd_dev->parent_overlap = 0; - rbd_dev_parent_put(rbd_dev); - pr_info("%s: clone image has been flattened\n", - rbd_dev->disk->disk_name); - } - + if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) goto out; /* No parent? No problem. */ - } /* The ceph file layout needs to fit pool id in 32 bits */ @@ -5754,58 +5776,46 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } /* - * The parent won't change (except when the clone is - * flattened, already handled that). So we only need to - * record the parent spec we have not already done so. + * The parent won't change except when the clone is flattened, + * so we only need to record the parent image spec once. */ - if (!rbd_dev->parent_spec) { - parent_spec->pool_id = pii.pool_id; - if (pii.pool_ns && *pii.pool_ns) { - parent_spec->pool_ns = pii.pool_ns; - pii.pool_ns = NULL; - } - parent_spec->image_id = pii.image_id; - pii.image_id = NULL; - parent_spec->snap_id = pii.snap_id; - - rbd_dev->parent_spec = parent_spec; - parent_spec = NULL; /* rbd_dev now owns this */ + parent_spec->pool_id = pii.pool_id; + if (pii.pool_ns && *pii.pool_ns) { + parent_spec->pool_ns = pii.pool_ns; + pii.pool_ns = NULL; } + parent_spec->image_id = pii.image_id; + pii.image_id = NULL; + parent_spec->snap_id = pii.snap_id; + + rbd_assert(!rbd_dev->parent_spec); + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ /* - * We always update the parent overlap. If it's zero we issue - * a warning, as we will proceed as if there was no parent. + * Record the parent overlap. If it's zero, issue a warning as + * we will proceed as if there is no parent. */ - if (!pii.overlap) { - if (parent_spec) { - /* refresh, careful to warn just once */ - if (rbd_dev->parent_overlap) - rbd_warn(rbd_dev, - "clone now standalone (overlap became 0)"); - } else { - /* initial probe */ - rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); - } - } + if (!pii.overlap) + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); rbd_dev->parent_overlap = pii.overlap; out: ret = 0; out_err: - kfree(pii.pool_ns); - kfree(pii.image_id); + rbd_parent_info_cleanup(&pii); rbd_spec_put(parent_spec); return ret; } -static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev, + u64 *stripe_unit, u64 *stripe_count) { struct { __le64 stripe_unit; __le64 stripe_count; } __attribute__ ((packed)) striping_info_buf = { 0 }; size_t size = sizeof (striping_info_buf); - void *p; int ret; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, @@ -5817,27 +5827,33 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) if (ret < size) return -ERANGE; - p = &striping_info_buf; - rbd_dev->header.stripe_unit = ceph_decode_64(&p); - rbd_dev->header.stripe_count = ceph_decode_64(&p); + *stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit); + *stripe_count = le64_to_cpu(striping_info_buf.stripe_count); + dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit, + *stripe_count); + return 0; } -static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) +static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id) { - __le64 data_pool_id; + __le64 data_pool_buf; int ret; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_data_pool", - NULL, 0, &data_pool_id, sizeof(data_pool_id)); + NULL, 0, &data_pool_buf, + sizeof(data_pool_buf)); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; - if (ret < sizeof(data_pool_id)) + if (ret < sizeof(data_pool_buf)) return -EBADMSG; - rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); - WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); + *data_pool_id = le64_to_cpu(data_pool_buf); + dout(" data_pool_id = %lld\n", *data_pool_id); + WARN_ON(*data_pool_id == CEPH_NOPOOL); + return 0; } @@ -6029,7 +6045,8 @@ out_err: return ret; } -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, + struct ceph_snap_context **psnapc) { size_t size; int ret; @@ -6090,9 +6107,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); - ceph_put_snap_context(rbd_dev->header.snapc); - rbd_dev->header.snapc = snapc; - + *psnapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", (unsigned long long)seq, (unsigned int)snap_count); out: @@ -6141,38 +6156,42 @@ out: return snap_name; } -static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { - bool first_time = rbd_dev->header.object_prefix == NULL; int ret; - ret = rbd_dev_v2_image_size(rbd_dev); + ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, + first_time ? &header->obj_order : NULL, + &header->image_size); if (ret) return ret; if (first_time) { - ret = rbd_dev_v2_header_onetime(rbd_dev); + ret = rbd_dev_v2_header_onetime(rbd_dev, header); if (ret) return ret; } - ret = rbd_dev_v2_snap_context(rbd_dev); - if (ret && first_time) { - kfree(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - } + ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc); + if (ret) + return ret; - return ret; + return 0; } -static int rbd_dev_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_assert(!header->object_prefix && !header->snapc); if (rbd_dev->image_format == 1) - return rbd_dev_v1_header_info(rbd_dev); + return rbd_dev_v1_header_info(rbd_dev, header, first_time); - return rbd_dev_v2_header_info(rbd_dev); + return rbd_dev_v2_header_info(rbd_dev, header, first_time); } /* @@ -6556,18 +6575,12 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) cancel_delayed_work_sync(&rbd_dev->lock_dwork); if (!ret) ret = -ETIMEDOUT; - } - if (ret) { - rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); - return ret; + rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret); } + if (ret) + return ret; - /* - * The lock may have been released by now, unless automatic lock - * transitions are disabled. - */ - rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); return 0; } @@ -6661,60 +6674,49 @@ out: */ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { - struct rbd_image_header *header; - rbd_dev_parent_put(rbd_dev); rbd_object_map_free(rbd_dev); rbd_dev_mapping_clear(rbd_dev); /* Free dynamic fields from the header, then zero it out */ - header = &rbd_dev->header; - ceph_put_snap_context(header->snapc); - kfree(header->snap_sizes); - kfree(header->snap_names); - kfree(header->object_prefix); - memset(header, 0, sizeof (*header)); + rbd_image_header_cleanup(&rbd_dev->header); } -static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, + struct rbd_image_header *header) { int ret; - ret = rbd_dev_v2_object_prefix(rbd_dev); + ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix); if (ret) - goto out_err; + return ret; /* * Get the and check features for the image. Currently the * features are assumed to never change. */ - ret = rbd_dev_v2_features(rbd_dev); + ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, + rbd_is_ro(rbd_dev), &header->features); if (ret) - goto out_err; + return ret; /* If the image supports fancy striping, get its parameters */ - if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { - ret = rbd_dev_v2_striping_info(rbd_dev); - if (ret < 0) - goto out_err; + if (header->features & RBD_FEATURE_STRIPINGV2) { + ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit, + &header->stripe_count); + if (ret) + return ret; } - if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { - ret = rbd_dev_v2_data_pool(rbd_dev); + if (header->features & RBD_FEATURE_DATA_POOL) { + ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id); if (ret) - goto out_err; + return ret; } - rbd_init_layout(rbd_dev); return 0; - -out_err: - rbd_dev->header.features = 0; - kfree(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - return ret; } /* @@ -6736,7 +6738,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) goto out_err; } - parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); + parent = __rbd_dev_create(rbd_dev->parent_spec); if (!parent) { ret = -ENOMEM; goto out_err; @@ -6746,8 +6748,8 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) * Images related by parent/child relationships always share * rbd_client and spec/parent_spec, so bump their refcounts. */ - __rbd_get_client(rbd_dev->rbd_client); - rbd_spec_get(rbd_dev->parent_spec); + parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client); + parent->spec = rbd_spec_get(rbd_dev->parent_spec); __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags); @@ -6909,13 +6911,15 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) if (!depth) down_write(&rbd_dev->header_rwsem); - ret = rbd_dev_header_info(rbd_dev); + ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true); if (ret) { if (ret == -ENOENT && !need_watch) rbd_print_dne(rbd_dev, false); goto err_out_probe; } + rbd_init_layout(rbd_dev); + /* * If this image is the one being mapped, we have pool name and * id, image name and id, and snap name - need to fill snap id. @@ -6944,7 +6948,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) } if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { - ret = rbd_dev_v2_parent_info(rbd_dev); + ret = rbd_dev_setup_parent(rbd_dev); if (ret) goto err_out_probe; } @@ -6970,9 +6974,108 @@ err_out_format: return ret; } -static ssize_t do_rbd_add(struct bus_type *bus, - const char *buf, - size_t count) +static void rbd_dev_update_header(struct rbd_device *rbd_dev, + struct rbd_image_header *header) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_assert(rbd_dev->header.object_prefix); /* !first_time */ + + if (rbd_dev->header.image_size != header->image_size) { + rbd_dev->header.image_size = header->image_size; + + if (!rbd_is_snap(rbd_dev)) { + rbd_dev->mapping.size = header->image_size; + rbd_dev_update_size(rbd_dev); + } + } + + ceph_put_snap_context(rbd_dev->header.snapc); + rbd_dev->header.snapc = header->snapc; + header->snapc = NULL; + + if (rbd_dev->image_format == 1) { + kfree(rbd_dev->header.snap_names); + rbd_dev->header.snap_names = header->snap_names; + header->snap_names = NULL; + + kfree(rbd_dev->header.snap_sizes); + rbd_dev->header.snap_sizes = header->snap_sizes; + header->snap_sizes = NULL; + } +} + +static void rbd_dev_update_parent(struct rbd_device *rbd_dev, + struct parent_image_info *pii) +{ + if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) { + /* + * Either the parent never existed, or we have + * record of it but the image got flattened so it no + * longer has a parent. When the parent of a + * layered image disappears we immediately set the + * overlap to 0. The effect of this is that all new + * requests will be treated as if the image had no + * parent. + * + * If !pii.has_overlap, the parent image spec is not + * applicable. It's there to avoid duplication in each + * snapshot record. + */ + if (rbd_dev->parent_overlap) { + rbd_dev->parent_overlap = 0; + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone has been flattened\n", + rbd_dev->disk->disk_name); + } + } else { + rbd_assert(rbd_dev->parent_spec); + + /* + * Update the parent overlap. If it became zero, issue + * a warning as we will proceed as if there is no parent. + */ + if (!pii->overlap && rbd_dev->parent_overlap) + rbd_warn(rbd_dev, + "clone has become standalone (overlap 0)"); + rbd_dev->parent_overlap = pii->overlap; + } +} + +static int rbd_dev_refresh(struct rbd_device *rbd_dev) +{ + struct rbd_image_header header = { 0 }; + struct parent_image_info pii = { 0 }; + int ret; + + dout("%s rbd_dev %p\n", __func__, rbd_dev); + + ret = rbd_dev_header_info(rbd_dev, &header, false); + if (ret) + goto out; + + /* + * If there is a parent, see if it has disappeared due to the + * mapped image getting flattened. + */ + if (rbd_dev->parent) { + ret = rbd_dev_v2_parent_info(rbd_dev, &pii); + if (ret) + goto out; + } + + down_write(&rbd_dev->header_rwsem); + rbd_dev_update_header(rbd_dev, &header); + if (rbd_dev->parent) + rbd_dev_update_parent(rbd_dev, &pii); + up_write(&rbd_dev->header_rwsem); + +out: + rbd_parent_info_cleanup(&pii); + rbd_image_header_cleanup(&header); + return ret; +} + +static ssize_t do_rbd_add(const char *buf, size_t count) { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; @@ -7084,18 +7187,18 @@ err_out_args: goto out; } -static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count) { if (single_major) return -EINVAL; - return do_rbd_add(bus, buf, count); + return do_rbd_add(buf, count); } -static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf, size_t count) { - return do_rbd_add(bus, buf, count); + return do_rbd_add(buf, count); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) @@ -7125,12 +7228,9 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } } -static ssize_t do_rbd_remove(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_remove(const char *buf, size_t count) { struct rbd_device *rbd_dev = NULL; - struct list_head *tmp; int dev_id; char opt_buf[6]; bool force = false; @@ -7157,8 +7257,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, ret = -ENOENT; spin_lock(&rbd_dev_list_lock); - list_for_each(tmp, &rbd_dev_list) { - rbd_dev = list_entry(tmp, struct rbd_device, node); + list_for_each_entry(rbd_dev, &rbd_dev_list, node) { if (rbd_dev->dev_id == dev_id) { ret = 0; break; @@ -7182,8 +7281,10 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * Prevent new IO from being queued and wait for existing * IO to complete/fail. */ - blk_mq_freeze_queue(rbd_dev->disk->queue); + unsigned int memflags = blk_mq_freeze_queue(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); + blk_mq_unfreeze_queue(rbd_dev->disk->queue, memflags); } del_gendisk(rbd_dev->disk); @@ -7199,18 +7300,18 @@ static ssize_t do_rbd_remove(struct bus_type *bus, return count; } -static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count) { if (single_major) return -EINVAL; - return do_rbd_remove(bus, buf, count); + return do_rbd_remove(buf, count); } -static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf, size_t count) { - return do_rbd_remove(bus, buf, count); + return do_rbd_remove(buf, count); } /* @@ -7288,7 +7389,7 @@ static int __init rbd_init(void) * The number of active work items is limited by the number of * rbd devices * queue depth, so leave @max_active at default. */ - rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); + rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rbd_wq) { rc = -ENOMEM; goto err_out_slab; |
