summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-28 15:36:09 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-28 15:36:09 -0800
commitb2deee2dc06db7cdf99b84346e69bdb9db9baa85 (patch)
treeceb073fa12c1a9804761ec8ce8911a517b007ed6 /include
parentd4f4cf77b37eaea58ef863a4cbc95dad3880b524 (diff)
parent54ea0046b6fe36ec18e82d282a29a18da6cdea0f (diff)
Merge tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "This time around we have: - support for rbd data-pool feature, which enables rbd images on erasure-coded pools (myself). CEPH_PG_MAX_SIZE has been bumped to allow erasure-coded profiles with k+m up to 32. - a patch for ceph_d_revalidate() performance regression introduced in 4.9, along with some cleanups in the area (Jeff Layton) - a set of fixes for unsafe ->d_parent accesses in CephFS (Jeff Layton) - buffered reads are now processed in rsize windows instead of rasize windows (Andreas Gerstmayr). The new default for rsize mount option is 64M. - ack vs commit distinction is gone, greatly simplifying ->fsync() and MOSDOpReply handling code (myself) ... also a few filesystem bug fixes from Zheng, a CRUSH sync up (CRUSH computations are still serialized though) and several minor fixes and cleanups all over" * tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client: (52 commits) libceph, rbd, ceph: WRITE | ONDISK -> WRITE libceph: get rid of ack vs commit ceph: remove special ack vs commit behavior ceph: tidy some white space in get_nonsnap_parent() crush: fix dprintk compilation crush: do is_out test only if we do not collide ceph: remove req from unsafe list when unregistering it rbd: constify device_type structure rbd: kill obj_request->object_name and rbd_segment_name_cache rbd: store and use obj_request->object_no rbd: RBD_V{1,2}_DATA_FORMAT macros rbd: factor out __rbd_osd_req_create() rbd: set offset and length outside of rbd_obj_request_create() rbd: support for data-pool feature rbd: introduce rbd_init_layout() rbd: use rbd_obj_bytes() more rbd: remove now unused rbd_obj_request_wait() and helpers rbd: switch rbd_obj_method_sync() to ceph_osdc_call() libceph: pass reply buffer length through ceph_osdc_call() rbd: do away with obj_request in rbd_obj_read_sync() ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/ceph/osd_client.h6
-rw-r--r--include/linux/ceph/osdmap.h13
-rw-r--r--include/linux/ceph/rados.h2
-rw-r--r--include/linux/crush/crush.h41
-rw-r--r--include/linux/crush/mapper.h16
5 files changed, 54 insertions, 24 deletions
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..2ea0c282f3dc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -22,7 +22,6 @@ struct ceph_osd_client;
* completion callback for async writepages
*/
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
#define CEPH_HOMELESS_OSD -1
@@ -170,15 +169,12 @@ struct ceph_osd_request {
unsigned int r_num_ops;
int r_result;
- bool r_got_reply;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
- struct completion r_completion;
- struct completion r_done_completion; /* fsync waiter */
+ struct completion r_completion; /* private to osd_client.c */
ceph_osdc_callback_t r_callback;
- ceph_osdc_unsafe_callback_t r_unsafe_callback;
struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 9a9041784dcf..938656f70807 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
case CEPH_POOL_TYPE_EC:
return false;
default:
- BUG_ON(1);
+ BUG();
}
}
@@ -82,13 +82,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
void ceph_oloc_destroy(struct ceph_object_locator *oloc);
/*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
-/*
* 51-char inline_name is long enough for all cephfs and all but one
* rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
* arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
@@ -173,8 +166,8 @@ struct ceph_osdmap {
* the list of osds that store+replicate them. */
struct crush_map *crush;
- struct mutex crush_scratch_mutex;
- int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+ struct mutex crush_workspace_mutex;
+ void *crush_workspace;
};
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5c0da61cb763..5d0018782d50 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -50,7 +50,7 @@ struct ceph_timespec {
#define CEPH_PG_LAYOUT_LINEAR 2
#define CEPH_PG_LAYOUT_HYBRID 3
-#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */
/*
* placement group.
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index be8f12b8f195..fbecbd089d75 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -135,13 +135,6 @@ struct crush_bucket {
__u32 size; /* num items */
__s32 *items;
- /*
- * cached random permutation: used for uniform bucket and for
- * the linear search fallback for the other bucket types.
- */
- __u32 perm_x; /* @x for which *perm is defined */
- __u32 perm_n; /* num elements of *perm that are permuted/defined */
- __u32 *perm;
};
struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
* device fails. */
__u8 chooseleaf_stable;
+ /*
+ * This value is calculated after decode or construction by
+ * the builder. It is exposed here (rather than having a
+ * 'build CRUSH working space' function) so that callers can
+ * reserve a static buffer, allocate space on the stack, or
+ * otherwise avoid calling into the heap allocator if they
+ * want to. The size of the working space depends on the map,
+ * while the size of the scratch vector passed to the mapper
+ * depends on the size of the desired result set.
+ *
+ * Nothing stops the caller from allocating both in one swell
+ * foop and passing in two points, though.
+ */
+ size_t working_size;
+
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
return ((i+1) << 1)-1;
}
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allow us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+struct crush_work_bucket {
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm; /* Permutation of the bucket's items */
+};
+
+struct crush_work {
+ struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
#endif
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index 5dfd5b1125d2..c95e19e1ff11 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
const __u32 *weights, int weight_max,
- int *scratch);
+ void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+ int result_max)
+{
+ return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+void crush_init_workspace(const struct crush_map *map, void *v);
#endif