summaryrefslogtreecommitdiff
path: root/drivers/block/rbd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/rbd.c')
-rw-r--r--drivers/block/rbd.c903
1 files changed, 494 insertions, 409 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 531d390902dd..af0e21149dbc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -362,7 +362,7 @@ enum rbd_watch_state {
enum rbd_lock_state {
RBD_LOCK_STATE_UNLOCKED,
RBD_LOCK_STATE_LOCKED,
- RBD_LOCK_STATE_RELEASING,
+ RBD_LOCK_STATE_QUIESCING,
};
/* WatchNotify::ClientId */
@@ -422,7 +422,7 @@ struct rbd_device {
struct list_head running_list;
struct completion acquire_wait;
int acquire_err;
- struct completion releasing_wait;
+ struct completion quiescing_wait;
spinlock_t object_map_lock;
u8 *object_map;
@@ -491,12 +491,12 @@ static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
-static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
-static ssize_t remove_store(struct bus_type *bus, const char *buf,
+static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count);
+static ssize_t remove_store(const struct bus_type *bus, const char *buf,
size_t count);
-static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
size_t count);
-static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
@@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
lockdep_assert_held(&rbd_dev->lock_rwsem);
return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
- rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
+ rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING;
}
static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
@@ -538,7 +538,7 @@ static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
return is_lock_owner;
}
-static ssize_t supported_features_show(struct bus_type *bus, char *buf)
+static ssize_t supported_features_show(const struct bus_type *bus, char *buf)
{
return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
@@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = {
};
__ATTRIBUTE_GROUPS(rbd_bus);
-static struct bus_type rbd_bus_type = {
+static const struct bus_type rbd_bus_type = {
.name = "rbd",
.bus_groups = rbd_bus_groups,
};
@@ -632,9 +632,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
static int rbd_dev_refresh(struct rbd_device *rbd_dev);
-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
-static int rbd_dev_header_info(struct rbd_device *rbd_dev);
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
@@ -660,9 +659,9 @@ static bool pending_result_dec(struct pending_result *pending, int *result)
return true;
}
-static int rbd_open(struct block_device *bdev, fmode_t mode)
+static int rbd_open(struct gendisk *disk, blk_mode_t mode)
{
- struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ struct rbd_device *rbd_dev = disk->private_data;
bool removing = false;
spin_lock_irq(&rbd_dev->lock);
@@ -679,7 +678,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
return 0;
}
-static void rbd_release(struct gendisk *disk, fmode_t mode)
+static void rbd_release(struct gendisk *disk)
{
struct rbd_device *rbd_dev = disk->private_data;
unsigned long open_count_before;
@@ -756,24 +755,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
*/
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
- struct rbd_client *client_node;
- bool found = false;
+ struct rbd_client *rbdc = NULL, *iter;
if (ceph_opts->flags & CEPH_OPT_NOSHARE)
return NULL;
spin_lock(&rbd_client_list_lock);
- list_for_each_entry(client_node, &rbd_client_list, node) {
- if (!ceph_compare_options(ceph_opts, client_node->client)) {
- __rbd_get_client(client_node);
+ list_for_each_entry(iter, &rbd_client_list, node) {
+ if (!ceph_compare_options(ceph_opts, iter->client)) {
+ __rbd_get_client(iter);
- found = true;
+ rbdc = iter;
break;
}
}
spin_unlock(&rbd_client_list_lock);
- return found ? client_node : NULL;
+ return rbdc;
}
/*
@@ -836,7 +834,7 @@ struct rbd_options {
u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};
-#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
+#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_DEFAULT_RQ
#define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
#define RBD_READ_ONLY_DEFAULT false
@@ -996,15 +994,24 @@ static void rbd_init_layout(struct rbd_device *rbd_dev)
RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
+static void rbd_image_header_cleanup(struct rbd_image_header *header)
+{
+ kfree(header->object_prefix);
+ ceph_put_snap_context(header->snapc);
+ kfree(header->snap_sizes);
+ kfree(header->snap_names);
+
+ memset(header, 0, sizeof(*header));
+}
+
/*
* Fill an rbd image header with information from the given format 1
* on-disk header.
*/
-static int rbd_header_from_disk(struct rbd_device *rbd_dev,
- struct rbd_image_header_ondisk *ondisk)
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ bool first_time)
{
- struct rbd_image_header *header = &rbd_dev->header;
- bool first_time = header->object_prefix == NULL;
struct ceph_snap_context *snapc;
char *object_prefix = NULL;
char *snap_names = NULL;
@@ -1071,11 +1078,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
if (first_time) {
header->object_prefix = object_prefix;
header->obj_order = ondisk->options.order;
- rbd_init_layout(rbd_dev);
- } else {
- ceph_put_snap_context(header->snapc);
- kfree(header->snap_names);
- kfree(header->snap_sizes);
}
/* The remaining fields always get updated (when we refresh) */
@@ -1219,24 +1221,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
rbd_dev->mapping.size = 0;
}
-static void zero_bvec(struct bio_vec *bv)
-{
- void *buf;
- unsigned long flags;
-
- buf = bvec_kmap_irq(bv, &flags);
- memset(buf, 0, bv->bv_len);
- flush_dcache_page(bv->bv_page);
- bvec_kunmap_irq(buf, &flags);
-}
-
static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
struct ceph_bio_iter it = *bio_pos;
ceph_bio_iter_advance(&it, off);
ceph_bio_iter_advance_step(&it, bytes, ({
- zero_bvec(&bv);
+ memzero_bvec(&bv);
}));
}
@@ -1246,7 +1237,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
ceph_bvec_iter_advance(&it, off);
ceph_bvec_iter_advance_step(&it, bytes, ({
- zero_bvec(&bv);
+ memzero_bvec(&bv);
}));
}
@@ -1309,7 +1300,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
__func__, osd_req, obj_req, obj_req->ex.oe_objno,
obj_req->ex.oe_off, obj_req->ex.oe_len);
- ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
+ ceph_osdc_start_request(osd_req->r_osdc, osd_req);
}
/*
@@ -1346,14 +1337,30 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
/*
* Must be called after rbd_obj_calc_img_extents().
*/
-static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
+static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
{
- if (!obj_req->num_img_extents ||
- (rbd_obj_is_entire(obj_req) &&
- !obj_req->img_request->snapc->num_snaps))
- return false;
+ rbd_assert(obj_req->img_request->snapc);
- return true;
+ if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
+ dout("%s %p objno %llu discard\n", __func__, obj_req,
+ obj_req->ex.oe_objno);
+ return;
+ }
+
+ if (!obj_req->num_img_extents) {
+ dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
+ obj_req->ex.oe_objno);
+ return;
+ }
+
+ if (rbd_obj_is_entire(obj_req) &&
+ !obj_req->img_request->snapc->num_snaps) {
+ dout("%s %p objno %llu entire\n", __func__, obj_req,
+ obj_req->ex.oe_objno);
+ return;
+ }
+
+ obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
}
static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
@@ -1454,6 +1461,7 @@ __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
+ rbd_assert(obj_req->img_request->snapc);
return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
num_ops);
}
@@ -1590,15 +1598,18 @@ static void rbd_img_request_init(struct rbd_img_request *img_request,
mutex_init(&img_request->state_mutex);
}
+/*
+ * Only snap_id is captured here, for reads. For writes, snapshot
+ * context is captured in rbd_img_object_requests() after exclusive
+ * lock is ensured to be held.
+ */
static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
struct rbd_device *rbd_dev = img_req->rbd_dev;
lockdep_assert_held(&rbd_dev->header_rwsem);
- if (rbd_img_is_write(img_req))
- img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
- else
+ if (!rbd_img_is_write(img_req))
img_req->snap_id = rbd_dev->spec->snap_id;
if (rbd_dev_parent_get(rbd_dev))
@@ -2093,7 +2104,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
if (ret)
return ret;
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
return 0;
}
@@ -2245,9 +2256,6 @@ static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- if (rbd_obj_copyup_enabled(obj_req))
- obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
-
obj_req->write_state = RBD_OBJ_WRITE_START;
return 0;
}
@@ -2353,8 +2361,6 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- if (rbd_obj_copyup_enabled(obj_req))
- obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
if (!obj_req->num_img_extents) {
obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
if (rbd_obj_is_entire(obj_req))
@@ -2997,8 +3003,7 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
};
ceph_bvec_iter_advance_step(&it, bytes, ({
- if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
- bv.bv_len))
+ if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
return false;
}));
return true;
@@ -3081,13 +3086,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
for (i = 0; i < obj_req->copyup_bvec_count; i++) {
unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
+ struct page *page = alloc_page(GFP_NOIO);
- obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
- if (!obj_req->copyup_bvecs[i].bv_page)
+ if (!page)
return -ENOMEM;
- obj_req->copyup_bvecs[i].bv_offset = 0;
- obj_req->copyup_bvecs[i].bv_len = len;
+ bvec_set_page(&obj_req->copyup_bvecs[i], page, len, 0);
obj_overlap -= len;
}
@@ -3300,6 +3304,7 @@ again:
case RBD_OBJ_WRITE_START:
rbd_assert(!*result);
+ rbd_obj_set_copyup_enabled(obj_req);
if (rbd_obj_write_is_noop(obj_req))
return true;
@@ -3447,17 +3452,19 @@ static bool rbd_lock_add_request(struct rbd_img_request *img_req)
static void rbd_lock_del_request(struct rbd_img_request *img_req)
{
struct rbd_device *rbd_dev = img_req->rbd_dev;
- bool need_wakeup;
+ bool need_wakeup = false;
lockdep_assert_held(&rbd_dev->lock_rwsem);
spin_lock(&rbd_dev->lock_lists_lock);
- rbd_assert(!list_empty(&img_req->lock_item));
- list_del_init(&img_req->lock_item);
- need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
- list_empty(&rbd_dev->running_list));
+ if (!list_empty(&img_req->lock_item)) {
+ rbd_assert(!list_empty(&rbd_dev->running_list));
+ list_del_init(&img_req->lock_item);
+ need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING &&
+ list_empty(&rbd_dev->running_list));
+ }
spin_unlock(&rbd_dev->lock_lists_lock);
if (need_wakeup)
- complete(&rbd_dev->releasing_wait);
+ complete(&rbd_dev->quiescing_wait);
}
static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
@@ -3470,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
if (rbd_lock_add_request(img_req))
return 1;
- if (rbd_dev->opts->exclusive) {
- WARN_ON(1); /* lock got released? */
- return -EROFS;
- }
-
/*
* Note the use of mod_delayed_work() in rbd_acquire_lock()
* and cancel_delayed_work() in wake_lock_waiters().
@@ -3486,9 +3488,19 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
struct rbd_obj_request *obj_req;
rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
+ rbd_assert(!need_exclusive_lock(img_req) ||
+ __rbd_is_lock_owner(rbd_dev));
+
+ if (rbd_img_is_write(img_req)) {
+ rbd_assert(!img_req->snapc);
+ down_read(&rbd_dev->header_rwsem);
+ img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+ up_read(&rbd_dev->header_rwsem);
+ }
for_each_obj_request(img_req, obj_req) {
int result = 0;
@@ -3506,7 +3518,6 @@ static void rbd_img_object_requests(struct rbd_img_request *img_req)
static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
- struct rbd_device *rbd_dev = img_req->rbd_dev;
int ret;
again:
@@ -3527,9 +3538,6 @@ again:
if (*result)
return true;
- rbd_assert(!need_exclusive_lock(img_req) ||
- __rbd_is_lock_owner(rbd_dev));
-
rbd_img_object_requests(img_req);
if (!img_req->pending.num_pending) {
*result = img_req->pending.result;
@@ -3667,7 +3675,7 @@ static int rbd_lock(struct rbd_device *rbd_dev)
ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
RBD_LOCK_TAG, "", 0);
- if (ret)
+ if (ret && ret != -EEXIST)
return ret;
__rbd_lock(rbd_dev, cookie);
@@ -3831,61 +3839,97 @@ static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
return;
}
- list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
+ while (!list_empty(&rbd_dev->acquiring_list)) {
+ img_req = list_first_entry(&rbd_dev->acquiring_list,
+ struct rbd_img_request, lock_item);
mutex_lock(&img_req->state_mutex);
rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
+ if (!result)
+ list_move_tail(&img_req->lock_item,
+ &rbd_dev->running_list);
+ else
+ list_del_init(&img_req->lock_item);
rbd_img_schedule(img_req, result);
mutex_unlock(&img_req->state_mutex);
}
+}
- list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
+static bool locker_equal(const struct ceph_locker *lhs,
+ const struct ceph_locker *rhs)
+{
+ return lhs->id.name.type == rhs->id.name.type &&
+ lhs->id.name.num == rhs->id.name.num &&
+ !strcmp(lhs->id.cookie, rhs->id.cookie) &&
+ ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr);
}
-static int get_lock_owner_info(struct rbd_device *rbd_dev,
- struct ceph_locker **lockers, u32 *num_lockers)
+static void free_locker(struct ceph_locker *locker)
+{
+ if (locker)
+ ceph_free_lockers(locker, 1);
+}
+
+static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct ceph_locker *lockers;
+ u32 num_lockers;
u8 lock_type;
char *lock_tag;
+ u64 handle;
int ret;
- dout("%s rbd_dev %p\n", __func__, rbd_dev);
-
ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, RBD_LOCK_NAME,
- &lock_type, &lock_tag, lockers, num_lockers);
- if (ret)
- return ret;
+ &lock_type, &lock_tag, &lockers, &num_lockers);
+ if (ret) {
+ rbd_warn(rbd_dev, "failed to get header lockers: %d", ret);
+ return ERR_PTR(ret);
+ }
- if (*num_lockers == 0) {
+ if (num_lockers == 0) {
dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
+ lockers = NULL;
goto out;
}
if (strcmp(lock_tag, RBD_LOCK_TAG)) {
rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
lock_tag);
- ret = -EBUSY;
- goto out;
+ goto err_busy;
}
- if (lock_type == CEPH_CLS_LOCK_SHARED) {
- rbd_warn(rbd_dev, "shared lock type detected");
- ret = -EBUSY;
- goto out;
+ if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) {
+ rbd_warn(rbd_dev, "incompatible lock type detected");
+ goto err_busy;
}
- if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
- strlen(RBD_LOCK_COOKIE_PREFIX))) {
+ WARN_ON(num_lockers != 1);
+ ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu",
+ &handle);
+ if (ret != 1) {
rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
- (*lockers)[0].id.cookie);
- ret = -EBUSY;
- goto out;
+ lockers[0].id.cookie);
+ goto err_busy;
+ }
+ if (ceph_addr_is_blank(&lockers[0].info.addr)) {
+ rbd_warn(rbd_dev, "locker has a blank address");
+ goto err_busy;
}
+ dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n",
+ __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name),
+ &lockers[0].info.addr.in_addr,
+ le32_to_cpu(lockers[0].info.addr.nonce), handle);
+
out:
kfree(lock_tag);
- return ret;
+ return lockers;
+
+err_busy:
+ kfree(lock_tag);
+ ceph_free_lockers(lockers, num_lockers);
+ return ERR_PTR(-EBUSY);
}
static int find_watcher(struct rbd_device *rbd_dev,
@@ -3901,8 +3945,10 @@ static int find_watcher(struct rbd_device *rbd_dev,
ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, &watchers,
&num_watchers);
- if (ret)
+ if (ret) {
+ rbd_warn(rbd_dev, "failed to get watchers: %d", ret);
return ret;
+ }
sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
for (i = 0; i < num_watchers; i++) {
@@ -3939,51 +3985,72 @@ out:
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
struct ceph_client *client = rbd_dev->rbd_client->client;
- struct ceph_locker *lockers;
- u32 num_lockers;
+ struct ceph_locker *locker, *refreshed_locker;
int ret;
for (;;) {
+ locker = refreshed_locker = NULL;
+
ret = rbd_lock(rbd_dev);
- if (ret != -EBUSY)
- return ret;
+ if (!ret)
+ goto out;
+ if (ret != -EBUSY) {
+ rbd_warn(rbd_dev, "failed to lock header: %d", ret);
+ goto out;
+ }
/* determine if the current lock holder is still alive */
- ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
- if (ret)
- return ret;
-
- if (num_lockers == 0)
+ locker = get_lock_owner_info(rbd_dev);
+ if (IS_ERR(locker)) {
+ ret = PTR_ERR(locker);
+ locker = NULL;
+ goto out;
+ }
+ if (!locker)
goto again;
- ret = find_watcher(rbd_dev, lockers);
+ ret = find_watcher(rbd_dev, locker);
if (ret)
goto out; /* request lock or error */
+ refreshed_locker = get_lock_owner_info(rbd_dev);
+ if (IS_ERR(refreshed_locker)) {
+ ret = PTR_ERR(refreshed_locker);
+ refreshed_locker = NULL;
+ goto out;
+ }
+ if (!refreshed_locker ||
+ !locker_equal(locker, refreshed_locker))
+ goto again;
+
rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
- ENTITY_NAME(lockers[0].id.name));
+ ENTITY_NAME(locker->id.name));
ret = ceph_monc_blocklist_add(&client->monc,
- &lockers[0].info.addr);
+ &locker->info.addr);
if (ret) {
- rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
- ENTITY_NAME(lockers[0].id.name), ret);
+ rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d",
+ ENTITY_NAME(locker->id.name), ret);
goto out;
}
ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, RBD_LOCK_NAME,
- lockers[0].id.cookie,
- &lockers[0].id.name);
- if (ret && ret != -ENOENT)
+ locker->id.cookie, &locker->id.name);
+ if (ret && ret != -ENOENT) {
+ rbd_warn(rbd_dev, "failed to break header lock: %d",
+ ret);
goto out;
+ }
again:
- ceph_free_lockers(lockers, num_lockers);
+ free_locker(refreshed_locker);
+ free_locker(locker);
}
out:
- ceph_free_lockers(lockers, num_lockers);
+ free_locker(refreshed_locker);
+ free_locker(locker);
return ret;
}
@@ -3991,6 +4058,10 @@ static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
{
int ret;
+ ret = rbd_dev_refresh(rbd_dev);
+ if (ret)
+ return ret;
+
if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
ret = rbd_object_map_open(rbd_dev);
if (ret)
@@ -4029,11 +4100,8 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
ret = rbd_try_lock(rbd_dev);
if (ret < 0) {
- rbd_warn(rbd_dev, "failed to lock header: %d", ret);
- if (ret == -EBLOCKLISTED)
- goto out;
-
- ret = 1; /* request lock anyway */
+ rbd_warn(rbd_dev, "failed to acquire lock: %d", ret);
+ goto out;
}
if (ret > 0) {
up_write(&rbd_dev->lock_rwsem);
@@ -4100,8 +4168,6 @@ again:
static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
{
- bool need_wait;
-
dout("%s rbd_dev %p\n", __func__, rbd_dev);
lockdep_assert_held_write(&rbd_dev->lock_rwsem);
@@ -4111,16 +4177,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
/*
* Ensure that all in-flight IO is flushed.
*/
- rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
- rbd_assert(!completion_done(&rbd_dev->releasing_wait));
- need_wait = !list_empty(&rbd_dev->running_list);
- downgrade_write(&rbd_dev->lock_rwsem);
- if (need_wait)
- wait_for_completion(&rbd_dev->releasing_wait);
- up_read(&rbd_dev->lock_rwsem);
+ rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING;
+ rbd_assert(!completion_done(&rbd_dev->quiescing_wait));
+ if (list_empty(&rbd_dev->running_list))
+ return true;
+
+ up_write(&rbd_dev->lock_rwsem);
+ wait_for_completion(&rbd_dev->quiescing_wait);
down_write(&rbd_dev->lock_rwsem);
- if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
+ if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING)
return false;
rbd_assert(list_empty(&rbd_dev->running_list));
@@ -4203,15 +4269,11 @@ static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
down_write(&rbd_dev->lock_rwsem);
if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
- /*
- * we already know that the remote client is
- * the owner
- */
- up_write(&rbd_dev->lock_rwsem);
- return;
+ dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
+ __func__, rbd_dev, cid.gid, cid.handle);
+ } else {
+ rbd_set_owner_cid(rbd_dev, &cid);
}
-
- rbd_set_owner_cid(rbd_dev, &cid);
downgrade_write(&rbd_dev->lock_rwsem);
} else {
down_read(&rbd_dev->lock_rwsem);
@@ -4236,14 +4298,12 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
down_write(&rbd_dev->lock_rwsem);
if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
- dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
+ dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
__func__, rbd_dev, cid.gid, cid.handle,
rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
- up_write(&rbd_dev->lock_rwsem);
- return;
+ } else {
+ rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
}
-
- rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
downgrade_write(&rbd_dev->lock_rwsem);
} else {
down_read(&rbd_dev->lock_rwsem);
@@ -4537,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
rbd_warn(rbd_dev, "failed to update lock cookie: %d",
ret);
+ if (rbd_dev->opts->exclusive)
+ rbd_warn(rbd_dev,
+ "temporarily releasing lock on exclusive mapping");
+
/*
* Lock cookie cannot be updated on older OSDs, so do
* a manual release and queue an acquire.
@@ -4750,7 +4814,7 @@ static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
- blk_cleanup_disk(rbd_dev->disk);
+ put_disk(rbd_dev->disk);
blk_mq_free_tag_set(&rbd_dev->tag_set);
rbd_dev->disk = NULL;
}
@@ -4789,7 +4853,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
if (ret)
goto out_req;
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0)
ceph_copy_from_page_vector(pages, buf, 0, ret);
@@ -4804,7 +4868,9 @@ out_req:
* return, the rbd_dev->header field will contain up-to-date
* information about the image.
*/
-static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
+static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header,
+ bool first_time)
{
struct rbd_image_header_ondisk *ondisk = NULL;
u32 snap_count = 0;
@@ -4852,7 +4918,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
snap_count = le32_to_cpu(ondisk->snap_count);
} while (snap_count != want_count);
- ret = rbd_header_from_disk(rbd_dev, ondisk);
+ ret = rbd_header_from_disk(header, ondisk, first_time);
out:
kfree(ondisk);
@@ -4876,39 +4942,6 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev)
}
}
-static int rbd_dev_refresh(struct rbd_device *rbd_dev)
-{
- u64 mapping_size;
- int ret;
-
- down_write(&rbd_dev->header_rwsem);
- mapping_size = rbd_dev->mapping.size;
-
- ret = rbd_dev_header_info(rbd_dev);
- if (ret)
- goto out;
-
- /*
- * If there is a parent, see if it has disappeared due to the
- * mapped image getting flattened.
- */
- if (rbd_dev->parent) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
- if (ret)
- goto out;
- }
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- rbd_dev->mapping.size = rbd_dev->header.image_size;
-
-out:
- up_write(&rbd_dev->header_rwsem);
- if (!ret && mapping_size != rbd_dev->mapping.size)
- rbd_dev_update_size(rbd_dev);
-
- return ret;
-}
-
static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
};
@@ -4916,16 +4949,21 @@ static const struct blk_mq_ops rbd_mq_ops = {
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
struct gendisk *disk;
- struct request_queue *q;
unsigned int objset_bytes =
rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
+ struct queue_limits lim = {
+ .max_hw_sectors = objset_bytes >> SECTOR_SHIFT,
+ .io_opt = objset_bytes,
+ .io_min = rbd_dev->opts->alloc_size,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
int err;
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
- rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
@@ -4933,45 +4971,31 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (err)
return err;
- disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
+ if (rbd_dev->opts->trim) {
+ lim.discard_granularity = rbd_dev->opts->alloc_size;
+ lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT;
+ lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT;
+ }
+
+ if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
+ lim.features |= BLK_FEAT_STABLE_WRITES;
+
+ disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_tag_set;
}
- q = disk->queue;
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
rbd_dev->dev_id);
disk->major = rbd_dev->major;
disk->first_minor = rbd_dev->minor;
- if (single_major) {
+ if (single_major)
disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
- disk->flags |= GENHD_FL_EXT_DEVT;
- } else {
+ else
disk->minors = RBD_MINORS_PER_MAJOR;
- }
disk->fops = &rbd_bd_ops;
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
-
- blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
- q->limits.max_sectors = queue_max_hw_sectors(q);
- blk_queue_max_segments(q, USHRT_MAX);
- blk_queue_max_segment_size(q, UINT_MAX);
- blk_queue_io_min(q, rbd_dev->opts->alloc_size);
- blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
-
- if (rbd_dev->opts->trim) {
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
- q->limits.discard_granularity = rbd_dev->opts->alloc_size;
- blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
- blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
- }
-
- if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
-
+ disk->private_data = rbd_dev;
rbd_dev->disk = disk;
return 0;
@@ -5301,7 +5325,7 @@ static void rbd_dev_release(struct device *dev)
if (need_put) {
destroy_workqueue(rbd_dev->task_wq);
- ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+ ida_free(&rbd_dev_id_ida, rbd_dev->dev_id);
}
rbd_dev_free(rbd_dev);
@@ -5315,8 +5339,7 @@ static void rbd_dev_release(struct device *dev)
module_put(THIS_MODULE);
}
-static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
- struct rbd_spec *spec)
+static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
{
struct rbd_device *rbd_dev;
@@ -5352,7 +5375,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->acquiring_list);
INIT_LIST_HEAD(&rbd_dev->running_list);
init_completion(&rbd_dev->acquire_wait);
- init_completion(&rbd_dev->releasing_wait);
+ init_completion(&rbd_dev->quiescing_wait);
spin_lock_init(&rbd_dev->object_map_lock);
@@ -5361,9 +5384,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
rbd_dev->dev.parent = &rbd_root_dev;
device_initialize(&rbd_dev->dev);
- rbd_dev->rbd_client = rbdc;
- rbd_dev->spec = spec;
-
return rbd_dev;
}
@@ -5376,16 +5396,14 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
{
struct rbd_device *rbd_dev;
- rbd_dev = __rbd_dev_create(rbdc, spec);
+ rbd_dev = __rbd_dev_create(spec);
if (!rbd_dev)
return NULL;
- rbd_dev->opts = opts;
-
/* get an id and fill in device name */
- rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
- minor_to_rbd_dev_id(1 << MINORBITS),
- GFP_KERNEL);
+ rbd_dev->dev_id = ida_alloc_max(&rbd_dev_id_ida,
+ minor_to_rbd_dev_id(1 << MINORBITS) - 1,
+ GFP_KERNEL);
if (rbd_dev->dev_id < 0)
goto fail_rbd_dev;
@@ -5398,11 +5416,15 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
/* we have a ref from do_rbd_add() */
__module_get(THIS_MODULE);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->spec = spec;
+ rbd_dev->opts = opts;
+
dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
return rbd_dev;
fail_dev_id:
- ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+ ida_free(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
rbd_dev_free(rbd_dev);
return NULL;
@@ -5452,17 +5474,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
return 0;
}
-static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
-{
- return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
- &rbd_dev->header.obj_order,
- &rbd_dev->header.image_size);
-}
-
-static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev,
+ char **pobject_prefix)
{
size_t size;
void *reply_buf;
+ char *object_prefix;
int ret;
void *p;
@@ -5480,16 +5497,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
goto out;
p = reply_buf;
- rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
- p + ret, NULL, GFP_NOIO);
+ object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL,
+ GFP_NOIO);
+ if (IS_ERR(object_prefix)) {
+ ret = PTR_ERR(object_prefix);
+ goto out;
+ }
ret = 0;
- if (IS_ERR(rbd_dev->header.object_prefix)) {
- ret = PTR_ERR(rbd_dev->header.object_prefix);
- rbd_dev->header.object_prefix = NULL;
- } else {
- dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
- }
+ *pobject_prefix = object_prefix;
+ dout(" object_prefix = %s\n", object_prefix);
out:
kfree(reply_buf);
@@ -5540,13 +5557,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
return 0;
}
-static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
-{
- return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
- rbd_is_ro(rbd_dev),
- &rbd_dev->header.features);
-}
-
/*
* These are generic image flags, but since they are used only for
* object map, store them in rbd_dev->object_map_flags.
@@ -5583,6 +5593,14 @@ struct parent_image_info {
u64 overlap;
};
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+ kfree(pii->pool_ns);
+ kfree(pii->image_id);
+
+ memset(pii, 0, sizeof(*pii));
+}
+
/*
* The caller is responsible for @pii.
*/
@@ -5652,6 +5670,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
if (pii->has_overlap)
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
@@ -5690,14 +5711,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
pii->has_overlap = true;
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
return -EINVAL;
}
-static int get_parent_info(struct rbd_device *rbd_dev,
- struct parent_image_info *pii)
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
{
struct page *req_page, *reply_page;
void *p;
@@ -5725,7 +5749,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
return ret;
}
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
{
struct rbd_spec *parent_spec;
struct parent_image_info pii = { 0 };
@@ -5735,37 +5759,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (!parent_spec)
return -ENOMEM;
- ret = get_parent_info(rbd_dev, &pii);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out_err;
- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
- pii.has_overlap, pii.overlap);
-
- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
- /*
- * Either the parent never existed, or we have
- * record of it but the image got flattened so it no
- * longer has a parent. When the parent of a
- * layered image disappears we immediately set the
- * overlap to 0. The effect of this is that all new
- * requests will be treated as if the image had no
- * parent.
- *
- * If !pii.has_overlap, the parent image spec is not
- * applicable. It's there to avoid duplication in each
- * snapshot record.
- */
- if (rbd_dev->parent_overlap) {
- rbd_dev->parent_overlap = 0;
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image has been flattened\n",
- rbd_dev->disk->disk_name);
- }
-
+ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
goto out; /* No parent? No problem. */
- }
/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5777,58 +5776,46 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * The parent won't change (except when the clone is
- * flattened, already handled that). So we only need to
- * record the parent spec we have not already done so.
+ * The parent won't change except when the clone is flattened,
+ * so we only need to record the parent image spec once.
*/
- if (!rbd_dev->parent_spec) {
- parent_spec->pool_id = pii.pool_id;
- if (pii.pool_ns && *pii.pool_ns) {
- parent_spec->pool_ns = pii.pool_ns;
- pii.pool_ns = NULL;
- }
- parent_spec->image_id = pii.image_id;
- pii.image_id = NULL;
- parent_spec->snap_id = pii.snap_id;
-
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ parent_spec->pool_id = pii.pool_id;
+ if (pii.pool_ns && *pii.pool_ns) {
+ parent_spec->pool_ns = pii.pool_ns;
+ pii.pool_ns = NULL;
}
+ parent_spec->image_id = pii.image_id;
+ pii.image_id = NULL;
+ parent_spec->snap_id = pii.snap_id;
+
+ rbd_assert(!rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
/*
- * We always update the parent overlap. If it's zero we issue
- * a warning, as we will proceed as if there was no parent.
+ * Record the parent overlap. If it's zero, issue a warning as
+ * we will proceed as if there is no parent.
*/
- if (!pii.overlap) {
- if (parent_spec) {
- /* refresh, careful to warn just once */
- if (rbd_dev->parent_overlap)
- rbd_warn(rbd_dev,
- "clone now standalone (overlap became 0)");
- } else {
- /* initial probe */
- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
- }
- }
+ if (!pii.overlap)
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
rbd_dev->parent_overlap = pii.overlap;
out:
ret = 0;
out_err:
- kfree(pii.pool_ns);
- kfree(pii.image_id);
+ rbd_parent_info_cleanup(&pii);
rbd_spec_put(parent_spec);
return ret;
}
-static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev,
+ u64 *stripe_unit, u64 *stripe_count)
{
struct {
__le64 stripe_unit;
__le64 stripe_count;
} __attribute__ ((packed)) striping_info_buf = { 0 };
size_t size = sizeof (striping_info_buf);
- void *p;
int ret;
ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
@@ -5840,27 +5827,33 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
if (ret < size)
return -ERANGE;
- p = &striping_info_buf;
- rbd_dev->header.stripe_unit = ceph_decode_64(&p);
- rbd_dev->header.stripe_count = ceph_decode_64(&p);
+ *stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit);
+ *stripe_count = le64_to_cpu(striping_info_buf.stripe_count);
+ dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit,
+ *stripe_count);
+
return 0;
}
-static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id)
{
- __le64 data_pool_id;
+ __le64 data_pool_buf;
int ret;
ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
&rbd_dev->header_oloc, "get_data_pool",
- NULL, 0, &data_pool_id, sizeof(data_pool_id));
+ NULL, 0, &data_pool_buf,
+ sizeof(data_pool_buf));
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
- if (ret < sizeof(data_pool_id))
+ if (ret < sizeof(data_pool_buf))
return -EBADMSG;
- rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
- WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
+ *data_pool_id = le64_to_cpu(data_pool_buf);
+ dout(" data_pool_id = %lld\n", *data_pool_id);
+ WARN_ON(*data_pool_id == CEPH_NOPOOL);
+
return 0;
}
@@ -6052,7 +6045,8 @@ out_err:
return ret;
}
-static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev,
+ struct ceph_snap_context **psnapc)
{
size_t size;
int ret;
@@ -6113,9 +6107,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
for (i = 0; i < snap_count; i++)
snapc->snaps[i] = ceph_decode_64(&p);
- ceph_put_snap_context(rbd_dev->header.snapc);
- rbd_dev->header.snapc = snapc;
-
+ *psnapc = snapc;
dout(" snap context seq = %llu, snap_count = %u\n",
(unsigned long long)seq, (unsigned int)snap_count);
out:
@@ -6164,38 +6156,42 @@ out:
return snap_name;
}
-static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header,
+ bool first_time)
{
- bool first_time = rbd_dev->header.object_prefix == NULL;
int ret;
- ret = rbd_dev_v2_image_size(rbd_dev);
+ ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+ first_time ? &header->obj_order : NULL,
+ &header->image_size);
if (ret)
return ret;
if (first_time) {
- ret = rbd_dev_v2_header_onetime(rbd_dev);
+ ret = rbd_dev_v2_header_onetime(rbd_dev, header);
if (ret)
return ret;
}
- ret = rbd_dev_v2_snap_context(rbd_dev);
- if (ret && first_time) {
- kfree(rbd_dev->header.object_prefix);
- rbd_dev->header.object_prefix = NULL;
- }
+ ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc);
+ if (ret)
+ return ret;
- return ret;
+ return 0;
}
-static int rbd_dev_header_info(struct rbd_device *rbd_dev)
+static int rbd_dev_header_info(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header,
+ bool first_time)
{
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ rbd_assert(!header->object_prefix && !header->snapc);
if (rbd_dev->image_format == 1)
- return rbd_dev_v1_header_info(rbd_dev);
+ return rbd_dev_v1_header_info(rbd_dev, header, first_time);
- return rbd_dev_v2_header_info(rbd_dev);
+ return rbd_dev_v2_header_info(rbd_dev, header, first_time);
}
/*
@@ -6210,7 +6206,7 @@ static inline size_t next_token(const char **buf)
* These are the characters that produce nonzero for
* isspace() in the "C" and "POSIX" locales.
*/
- const char *spaces = " \f\n\r\t\v";
+ static const char spaces[] = " \f\n\r\t\v";
*buf += strspn(*buf, spaces); /* Find start of token */
@@ -6516,7 +6512,8 @@ static int rbd_add_parse_args(const char *buf,
pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
pctx.opts->trim = RBD_TRIM_DEFAULT;
- ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
+ ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL,
+ ',');
if (ret)
goto out_err;
@@ -6578,18 +6575,12 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
cancel_delayed_work_sync(&rbd_dev->lock_dwork);
if (!ret)
ret = -ETIMEDOUT;
- }
- if (ret) {
- rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
- return ret;
+ rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret);
}
+ if (ret)
+ return ret;
- /*
- * The lock may have been released by now, unless automatic lock
- * transitions are disabled.
- */
- rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
return 0;
}
@@ -6683,60 +6674,49 @@ out:
*/
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
- struct rbd_image_header *header;
-
rbd_dev_parent_put(rbd_dev);
rbd_object_map_free(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
/* Free dynamic fields from the header, then zero it out */
- header = &rbd_dev->header;
- ceph_put_snap_context(header->snapc);
- kfree(header->snap_sizes);
- kfree(header->snap_names);
- kfree(header->object_prefix);
- memset(header, 0, sizeof (*header));
+ rbd_image_header_cleanup(&rbd_dev->header);
}
-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
{
int ret;
- ret = rbd_dev_v2_object_prefix(rbd_dev);
+ ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix);
if (ret)
- goto out_err;
+ return ret;
/*
* Get the and check features for the image. Currently the
* features are assumed to never change.
*/
- ret = rbd_dev_v2_features(rbd_dev);
+ ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
+ rbd_is_ro(rbd_dev), &header->features);
if (ret)
- goto out_err;
+ return ret;
/* If the image supports fancy striping, get its parameters */
- if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
- ret = rbd_dev_v2_striping_info(rbd_dev);
- if (ret < 0)
- goto out_err;
+ if (header->features & RBD_FEATURE_STRIPINGV2) {
+ ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit,
+ &header->stripe_count);
+ if (ret)
+ return ret;
}
- if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
- ret = rbd_dev_v2_data_pool(rbd_dev);
+ if (header->features & RBD_FEATURE_DATA_POOL) {
+ ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id);
if (ret)
- goto out_err;
+ return ret;
}
- rbd_init_layout(rbd_dev);
return 0;
-
-out_err:
- rbd_dev->header.features = 0;
- kfree(rbd_dev->header.object_prefix);
- rbd_dev->header.object_prefix = NULL;
- return ret;
}
/*
@@ -6758,7 +6738,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
goto out_err;
}
- parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
+ parent = __rbd_dev_create(rbd_dev->parent_spec);
if (!parent) {
ret = -ENOMEM;
goto out_err;
@@ -6768,8 +6748,8 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
* Images related by parent/child relationships always share
* rbd_client and spec/parent_spec, so bump their refcounts.
*/
- __rbd_get_client(rbd_dev->rbd_client);
- rbd_spec_get(rbd_dev->parent_spec);
+ parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
+ parent->spec = rbd_spec_get(rbd_dev->parent_spec);
__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
@@ -6931,13 +6911,15 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
if (!depth)
down_write(&rbd_dev->header_rwsem);
- ret = rbd_dev_header_info(rbd_dev);
+ ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true);
if (ret) {
if (ret == -ENOENT && !need_watch)
rbd_print_dne(rbd_dev, false);
goto err_out_probe;
}
+ rbd_init_layout(rbd_dev);
+
/*
* If this image is the one being mapped, we have pool name and
* id, image name and id, and snap name - need to fill snap id.
@@ -6966,7 +6948,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
}
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_setup_parent(rbd_dev);
if (ret)
goto err_out_probe;
}
@@ -6992,9 +6974,108 @@ err_out_format:
return ret;
}
-static ssize_t do_rbd_add(struct bus_type *bus,
- const char *buf,
- size_t count)
+static void rbd_dev_update_header(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
+{
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
+
+ if (rbd_dev->header.image_size != header->image_size) {
+ rbd_dev->header.image_size = header->image_size;
+
+ if (!rbd_is_snap(rbd_dev)) {
+ rbd_dev->mapping.size = header->image_size;
+ rbd_dev_update_size(rbd_dev);
+ }
+ }
+
+ ceph_put_snap_context(rbd_dev->header.snapc);
+ rbd_dev->header.snapc = header->snapc;
+ header->snapc = NULL;
+
+ if (rbd_dev->image_format == 1) {
+ kfree(rbd_dev->header.snap_names);
+ rbd_dev->header.snap_names = header->snap_names;
+ header->snap_names = NULL;
+
+ kfree(rbd_dev->header.snap_sizes);
+ rbd_dev->header.snap_sizes = header->snap_sizes;
+ header->snap_sizes = NULL;
+ }
+}
+
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
+{
+ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ *
+ * If !pii.has_overlap, the parent image spec is not
+ * applicable. It's there to avoid duplication in each
+ * snapshot record.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+ } else {
+ rbd_assert(rbd_dev->parent_spec);
+
+ /*
+ * Update the parent overlap. If it became zero, issue
+ * a warning as we will proceed as if there is no parent.
+ */
+ if (!pii->overlap && rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone has become standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii->overlap;
+ }
+}
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
+ int ret;
+
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+ ret = rbd_dev_header_info(rbd_dev, &header, false);
+ if (ret)
+ goto out;
+
+ /*
+ * If there is a parent, see if it has disappeared due to the
+ * mapped image getting flattened.
+ */
+ if (rbd_dev->parent) {
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
+ if (ret)
+ goto out;
+ }
+
+ down_write(&rbd_dev->header_rwsem);
+ rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
+ up_write(&rbd_dev->header_rwsem);
+
+out:
+ rbd_parent_info_cleanup(&pii);
+ rbd_image_header_cleanup(&header);
+ return ret;
+}
+
+static ssize_t do_rbd_add(const char *buf, size_t count)
{
struct rbd_device *rbd_dev = NULL;
struct ceph_options *ceph_opts = NULL;
@@ -7073,7 +7154,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
if (rc)
goto err_out_image_lock;
- device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+ rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+ if (rc)
+ goto err_out_cleanup_disk;
spin_lock(&rbd_dev_list_lock);
list_add_tail(&rbd_dev->node, &rbd_dev_list);
@@ -7087,6 +7170,8 @@ out:
module_put(THIS_MODULE);
return rc;
+err_out_cleanup_disk:
+ rbd_free_disk(rbd_dev);
err_out_image_lock:
rbd_dev_image_unlock(rbd_dev);
rbd_dev_device_release(rbd_dev);
@@ -7102,18 +7187,18 @@ err_out_args:
goto out;
}
-static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count)
{
if (single_major)
return -EINVAL;
- return do_rbd_add(bus, buf, count);
+ return do_rbd_add(buf, count);
}
-static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
size_t count)
{
- return do_rbd_add(bus, buf, count);
+ return do_rbd_add(buf, count);
}
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
@@ -7143,12 +7228,9 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
}
}
-static ssize_t do_rbd_remove(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t do_rbd_remove(const char *buf, size_t count)
{
struct rbd_device *rbd_dev = NULL;
- struct list_head *tmp;
int dev_id;
char opt_buf[6];
bool force = false;
@@ -7175,8 +7257,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
ret = -ENOENT;
spin_lock(&rbd_dev_list_lock);
- list_for_each(tmp, &rbd_dev_list) {
- rbd_dev = list_entry(tmp, struct rbd_device, node);
+ list_for_each_entry(rbd_dev, &rbd_dev_list, node) {
if (rbd_dev->dev_id == dev_id) {
ret = 0;
break;
@@ -7200,8 +7281,10 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
* Prevent new IO from being queued and wait for existing
* IO to complete/fail.
*/
- blk_mq_freeze_queue(rbd_dev->disk->queue);
- blk_set_queue_dying(rbd_dev->disk->queue);
+ unsigned int memflags = blk_mq_freeze_queue(rbd_dev->disk->queue);
+
+ blk_mark_disk_dead(rbd_dev->disk);
+ blk_mq_unfreeze_queue(rbd_dev->disk->queue, memflags);
}
del_gendisk(rbd_dev->disk);
@@ -7217,18 +7300,18 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
return count;
}
-static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count)
{
if (single_major)
return -EINVAL;
- return do_rbd_remove(bus, buf, count);
+ return do_rbd_remove(buf, count);
}
-static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
size_t count)
{
- return do_rbd_remove(bus, buf, count);
+ return do_rbd_remove(buf, count);
}
/*
@@ -7240,8 +7323,10 @@ static int __init rbd_sysfs_init(void)
int ret;
ret = device_register(&rbd_root_dev);
- if (ret < 0)
+ if (ret < 0) {
+ put_device(&rbd_root_dev);
return ret;
+ }
ret = bus_register(&rbd_bus_type);
if (ret < 0)
@@ -7304,7 +7389,7 @@ static int __init rbd_init(void)
* The number of active work items is limited by the number of
* rbd devices * queue depth, so leave @max_active at default.
*/
- rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!rbd_wq) {
rc = -ENOMEM;
goto err_out_slab;