diff options
Diffstat (limited to 'include/linux/ceph')
| -rw-r--r-- | include/linux/ceph/auth.h | 120 | ||||
| -rw-r--r-- | include/linux/ceph/buffer.h | 8 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_debug.h | 49 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_features.h | 247 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_frag.h | 42 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_fs.h | 294 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_hash.h | 1 | ||||
| -rw-r--r-- | include/linux/ceph/cls_lock_client.h | 58 | ||||
| -rw-r--r-- | include/linux/ceph/debugfs.h | 25 | ||||
| -rw-r--r-- | include/linux/ceph/decode.h | 176 | ||||
| -rw-r--r-- | include/linux/ceph/libceph.h | 238 | ||||
| -rw-r--r-- | include/linux/ceph/mdsmap.h | 63 | ||||
| -rw-r--r-- | include/linux/ceph/messenger.h | 516 | ||||
| -rw-r--r-- | include/linux/ceph/mon_client.h | 81 | ||||
| -rw-r--r-- | include/linux/ceph/msgpool.h | 13 | ||||
| -rw-r--r-- | include/linux/ceph/msgr.h | 84 | ||||
| -rw-r--r-- | include/linux/ceph/osd_client.h | 583 | ||||
| -rw-r--r-- | include/linux/ceph/osdmap.h | 288 | ||||
| -rw-r--r-- | include/linux/ceph/pagelist.h | 31 | ||||
| -rw-r--r-- | include/linux/ceph/rados.h | 343 | ||||
| -rw-r--r-- | include/linux/ceph/string_table.h | 63 | ||||
| -rw-r--r-- | include/linux/ceph/striper.h | 71 | ||||
| -rw-r--r-- | include/linux/ceph/types.h | 2 |
23 files changed, 2556 insertions, 840 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 5f3386844134..6b138fa97db8 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_AUTH_H #define _FS_CEPH_AUTH_H @@ -12,7 +13,11 @@ */ struct ceph_auth_client; -struct ceph_authorizer; +struct ceph_msg; + +struct ceph_authorizer { + void (*destroy)(struct ceph_authorizer *); +}; struct ceph_auth_handshake { struct ceph_authorizer *authorizer; @@ -20,11 +25,13 @@ struct ceph_auth_handshake { size_t authorizer_buf_len; void *authorizer_reply_buf; size_t authorizer_reply_buf_len; + int (*sign_message)(struct ceph_auth_handshake *auth, + struct ceph_msg *msg); + int (*check_message_signature)(struct ceph_auth_handshake *auth, + struct ceph_msg *msg); }; struct ceph_auth_client_ops { - const char *name; - /* * true if we are authenticated and can connect to * services. @@ -43,8 +50,10 @@ struct ceph_auth_client_ops { * another request. */ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); - int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); + int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id, + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len); /* * Create authorizer for connecting to a service, and verify @@ -55,10 +64,15 @@ struct ceph_auth_client_ops { /* ensure that an existing authorizer is up to date */ int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + int (*add_authorizer_challenge)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a, size_t len); - void (*destroy_authorizer)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -66,6 +80,11 @@ struct ceph_auth_client_ops { void (*reset)(struct ceph_auth_client *ac); void (*destroy)(struct ceph_auth_client *ac); + + int (*sign_message)(struct ceph_auth_handshake *auth, + struct ceph_msg *msg); + int (*check_message_signature)(struct ceph_auth_handshake *auth, + struct ceph_msg *msg); }; struct ceph_auth_client { @@ -79,11 +98,17 @@ struct ceph_auth_client { const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + int preferred_mode; /* CEPH_CON_MODE_* */ + int fallback_mode; /* ditto */ + struct mutex mutex; }; -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const struct ceph_crypto_key *key); +void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id); + +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes); extern void ceph_auth_destroy(struct ceph_auth_client *ac); extern void ceph_auth_reset(struct ceph_auth_client *ac); @@ -93,24 +118,73 @@ extern int ceph_auth_build_hello(struct ceph_auth_client *ac, extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, void *buf, size_t len, void *reply_buf, size_t reply_len); -extern int ceph_entity_name_encode(const char *name, void **p, void *end); +int ceph_auth_entity_name_encode(const char *name, void **p, void *end); extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); - extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); -extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth); -extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a); -extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a); -extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a, - size_t len); + +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode); +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); +int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type); +static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth, + struct ceph_msg *msg) +{ + if (auth->sign_message) + return auth->sign_message(auth, msg); + return 0; +} + +static inline +int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, + struct ceph_msg *msg) +{ + if (auth->check_message_signature) + return auth->check_message_signature(auth, msg); + return 0; +} + +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len); +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len); +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + #endif diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..11cdc7c60480 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_CEPH_BUFFER_H #define __FS_CEPH_BUFFER_H @@ -10,14 +11,12 @@ /* * a simple reference counted buffer. * - * use kmalloc for small sizes (<= one page), vmalloc for larger - * sizes. + * use kmalloc for smaller sizes, vmalloc for larger sizes. */ struct ceph_buffer { struct kref kref; struct kvec vec; size_t alloc_len; - bool is_vmalloc; }; extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); @@ -31,7 +30,8 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) static inline void ceph_buffer_put(struct ceph_buffer *b) { - kref_put(&b->kref, ceph_buffer_release); + if (b) + kref_put(&b->kref, ceph_buffer_release); } extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index aa2e19182d99..5f904591fa5f 100644 --- a/include/linux/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -1,8 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_DEBUG_H #define _FS_CEPH_DEBUG_H #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/string.h> + #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* @@ -12,18 +15,25 @@ */ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ pr_debug("%.*s %12.12s:%-4d : " fmt, \ 8 - (int)sizeof(KBUILD_MODNAME), " ", \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) + kbasename(__FILE__), __LINE__, ##__VA_ARGS__) +# define doutc(client, fmt, ...) \ + pr_debug("%.*s %12.12s:%-4d : [%pU %llu] " fmt, \ + 8 - (int)sizeof(KBUILD_MODNAME), " ", \ + kbasename(__FILE__), __LINE__, \ + &client->fsid, client->monc.auth->global_id, \ + ##__VA_ARGS__) # else /* faux printk call just to see any compiler warnings. */ -# define dout(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ - } while (0) +# define dout(fmt, ...) \ + no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) +# define doutc(client, fmt, ...) \ + no_printk(KERN_DEBUG "[%pU %llu] " fmt, \ + &client->fsid, \ + client->monc.auth->global_id, \ + ##__VA_ARGS__) # endif #else @@ -32,7 +42,32 @@ extern const char *ceph_file_part(const char *s, int len); * or, just wrap pr_debug */ # define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) +# define doutc(client, fmt, ...) \ + pr_debug(" [%pU %llu] %s: " fmt, &client->fsid, \ + client->monc.auth->global_id, __func__, ##__VA_ARGS__) #endif +#define pr_notice_client(client, fmt, ...) \ + pr_notice("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_info_client(client, fmt, ...) \ + pr_info("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_warn_client(client, fmt, ...) \ + pr_warn("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_warn_once_client(client, fmt, ...) \ + pr_warn_once("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_err_client(client, fmt, ...) \ + pr_err("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_warn_ratelimited_client(client, fmt, ...) \ + pr_warn_ratelimited("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) +#define pr_err_ratelimited_client(client, fmt, ...) \ + pr_err_ratelimited("[%pU %llu]: " fmt, &client->fsid, \ + client->monc.auth->global_id, ##__VA_ARGS__) + #endif diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 4c42080347af..3a47acd9cc14 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -1,59 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CEPH_FEATURES #define __CEPH_FEATURES /* - * feature bits + * Each time we reclaim bits for reuse we need to specify another bit + * that, if present, indicates we have the new incarnation of that + * feature. Base case is 1 (first use). */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) -#define CEPH_FEATURE_PGID64 (1<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) -#define CEPH_FEATURE_PGPOOL3 (1<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1<<12) -#define CEPH_FEATURE_OSDENC (1<<13) -#define CEPH_FEATURE_OMAP (1<<14) -#define CEPH_FEATURE_MONENC (1<<15) -#define CEPH_FEATURE_QUERY_T (1<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1<<20) -#define CEPH_FEATURE_MON_GV (1<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) -#define CEPH_FEATURE_MSG_AUTH (1<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) -#define CEPH_FEATURE_CREATEPOOLID (1<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) -#define CEPH_FEATURE_MDSENC (1<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) +#define CEPH_FEATURE_INCARNATION_1 (0ull) +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC + +#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ + static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit); \ + static const uint64_t __maybe_unused CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +/* this bit is ignored but still advertised by release *when* */ +#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ + static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ + static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +/* + * this bit is ignored by release *unused* and not advertised by + * release *unadvertised* + */ +#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised) + + +/* + * test for a feature. this test is safer than a typical mask against + * the bit because it ensures that we have the bit AND the marker for the + * bit's incarnation. this must be used in any case where the features + * bits may include an old meaning of the bit. + */ +#define CEPH_HAVE_FEATURE(x, name) \ + (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name)) + + +/* + * Notes on deprecation: + * + * A *major* release is a release through which all upgrades must pass + * (e.g., jewel). For example, no pre-jewel server will ever talk to + * a post-jewel server (mon, osd, etc). + * + * For feature bits used *only* on the server-side: + * + * - In the first phase we indicate that a feature is DEPRECATED as of + * a particular release. This is the first major release X (say, + * jewel) that does not depend on its peers advertising the feature. + * That is, it safely assumes its peers all have the feature. We + * indicate this with the DEPRECATED macro. For example, + * + * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL) + * + * because 10.2.z (jewel) did not care if its peers advertised this + * feature bit. + * + * - In the second phase we stop advertising the bit and call it + * RETIRED. This can normally be done in the *next* major release + * following the one in which we marked the feature DEPRECATED. In + * the above example, for 12.0.z (luminous) we can say: + * + * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) + * + * - The bit can be reused in the first post-luminous release, 13.0.z + * (m). + * + * This ensures that no two versions who have different meanings for + * the bit ever speak to each other. + */ + +DEFINE_CEPH_FEATURE( 0, 1, UID) +DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) +DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS) +DEFINE_CEPH_FEATURE( 3, 1, FLOCK) +DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) +DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) +DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) +DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) +DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR) +DEFINE_CEPH_FEATURE( 9, 1, PGID64) +DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP) +DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) +DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX) +DEFINE_CEPH_FEATURE(13, 1, OSDENC) +DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN) +DEFINE_CEPH_FEATURE(15, 1, MONENC) +DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) +DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) +DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap +DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap +DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap +DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) +DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS) + +DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) +DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) +DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) +DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC) +DEFINE_CEPH_FEATURE(29, 1, MDSENC) +DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) +DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me +DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) +DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) +DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) +DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES) +DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap +DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) +DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) +DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) +DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap +DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) +DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) +DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2) +DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS) +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) +DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE(50, 1, MON_METADATA) +DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT) +DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES) +DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3) +DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT) +DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4) +DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) +DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) +DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap +DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap +DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap +DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) +DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap +DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap +DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) +DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap +DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap +DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* +DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit* + +DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal +DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing + /* * Features supported. */ -#define CEPH_FEATURES_SUPPORTED_DEFAULT \ +#define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SERVER_NAUTILUS | \ + CEPH_FEATURE_FLOCK | \ + CEPH_FEATURE_SUBSCRIBE2 | \ + CEPH_FEATURE_MONNAMES | \ CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ + CEPH_FEATURE_MONENC | \ CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_SERVER_LUMINOUS | \ + CEPH_FEATURE_RESEND_ON_SPLIT | \ + CEPH_FEATURE_RADOS_BACKOFF | \ + CEPH_FEATURE_OSDMAP_PG_UPMAP | \ + CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \ + CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ - CEPH_FEATURE_OSDHASHPSPOOL) - -#define CEPH_FEATURES_REQUIRED_DEFAULT \ - (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_RECONNECT_SEQ | \ - CEPH_FEATURE_PGID64 | \ - CEPH_FEATURE_PGPOOL3 | \ - CEPH_FEATURE_OSDENC) + CEPH_FEATURE_SERVER_MIMIC | \ + CEPH_FEATURE_MDSENC | \ + CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_EXPORT_PEER | \ + CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_MSGR_KEEPALIVE2 | \ + CEPH_FEATURE_OSD_POOLRESEND | \ + CEPH_FEATURE_MDS_QUOTA | \ + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_NEW_OSDOP_ENCODING | \ + CEPH_FEATURE_SERVER_JEWEL | \ + CEPH_FEATURE_MON_STATEFUL_SUB | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ + CEPH_FEATURE_MSG_ADDR2 | \ + CEPH_FEATURE_CEPHX_V2) + +#define CEPH_FEATURES_REQUIRED_DEFAULT 0 + #endif diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..97bab0adc58a 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_CEPH_FRAG_H #define FS_CEPH_FRAG_H @@ -40,57 +41,22 @@ static inline __u32 ceph_frag_mask_shift(__u32 f) return 24 - ceph_frag_bits(f); } -static inline int ceph_frag_contains_value(__u32 f, __u32 v) +static inline bool ceph_frag_contains_value(__u32 f, __u32 v) { return (v & ceph_frag_mask(f)) == ceph_frag_value(f); } -static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) -{ - /* is sub as specific as us, and contained by us? */ - return ceph_frag_bits(sub) >= ceph_frag_bits(f) && - (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); -} -static inline __u32 ceph_frag_parent(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f) - 1, - ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); -} -static inline int ceph_frag_is_left_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; -} -static inline int ceph_frag_is_right_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; -} -static inline __u32 ceph_frag_sibling(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); -} -static inline __u32 ceph_frag_left_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); -} -static inline __u32 ceph_frag_right_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, - ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); -} static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) { int newbits = ceph_frag_bits(f) + by; return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2ad7b860f062..c7f2c63b3bc3 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ceph_fs.h - Ceph constants and data types to share between kernel and * user space. @@ -27,16 +28,16 @@ #define CEPH_INO_ROOT 1 -#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ -#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 /* - * ceph_file_layout - describe data layout for a file/inode + * legacy ceph_file_layoute */ -struct ceph_file_layout { +struct ceph_file_layout_legacy { /* file -> object mapping */ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. */ @@ -53,9 +54,26 @@ struct ceph_file_layout { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); -#define CEPH_MIN_STRIPE_UNIT 65536 +struct ceph_string; +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + u32 stripe_unit; /* stripe unit, in bytes */ + u32 stripe_count; /* over this many objects */ + u32 object_size; /* until objects are this big */ + s64 pool_id; /* rados pool id */ + struct ceph_string __rcu *pool_ns; /* rados pool namespace */ +}; + +extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); +extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +#define CEPH_MIN_STRIPE_UNIT 65536 struct ceph_dir_layout { __u8 dl_dir_hash; /* see ceph_hash.h for ids */ @@ -75,8 +93,19 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_MODE_NONE 0 +#define CEPH_AUTH_MODE_AUTHORIZER 1 +#define CEPH_AUTH_MODE_MON 10 + +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) +const char *ceph_auth_proto_name(int proto); +const char *ceph_con_mode_name(int mode); /********************************************* * message layer @@ -104,6 +133,7 @@ struct ceph_dir_layout { /* client <-> mds */ #define CEPH_MSG_MDS_MAP 21 +#define CEPH_MSG_FS_MAP_USER 103 #define CEPH_MSG_CLIENT_SESSION 22 #define CEPH_MSG_CLIENT_RECONNECT 23 @@ -111,41 +141,37 @@ struct ceph_dir_layout { #define CEPH_MSG_CLIENT_REQUEST 24 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 #define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_METRICS 29 #define CEPH_MSG_CLIENT_CAPS 0x310 #define CEPH_MSG_CLIENT_LEASE 0x311 #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +#define CEPH_MSG_CLIENT_QUOTA 0x314 /* pool ops */ #define CEPH_MSG_POOLOP_REPLY 48 #define CEPH_MSG_POOLOP 49 +/* mon commands */ +#define CEPH_MSG_MON_COMMAND 50 +#define CEPH_MSG_MON_COMMAND_ACK 51 /* osd */ #define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 #define CEPH_MSG_WATCH_NOTIFY 44 +#define CEPH_MSG_OSD_BACKOFF 61 /* watch-notify operations */ enum { - WATCH_NOTIFY = 1, /* notifying watcher */ - WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ }; -/* pool operations */ -enum { - POOL_OP_CREATE = 0x01, - POOL_OP_DELETE = 0x02, - POOL_OP_AUID_CHANGE = 0x03, - POOL_OP_CREATE_SNAP = 0x11, - POOL_OP_DELETE_SNAP = 0x12, - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, -}; - struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -155,6 +181,8 @@ struct ceph_mon_request_header { struct ceph_mon_statfs { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; + __u8 contains_data_pool; + __le64 data_pool; } __attribute__ ((packed)); struct ceph_statfs { @@ -168,29 +196,12 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); -const char *ceph_pool_op_name(int op); - -struct ceph_mon_poolop { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 pool; - __le32 op; - __le64 auid; - __le64 snapid; - __le32 name_len; -} __attribute__ ((packed)); - -struct ceph_mon_poolop_reply { +struct ceph_mon_command { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; - __le32 reply_code; - __le32 epoch; - char has_data; - char data[0]; -} __attribute__ ((packed)); - -struct ceph_mon_unmanaged_snap { - __le64 snapid; + __le32 num_strs; /* always 1 */ + __le32 str_len; + char str[]; } __attribute__ ((packed)); struct ceph_osd_getmap { @@ -211,8 +222,8 @@ struct ceph_client_mount { #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; - __u8 onetime; + __le64 start; + __u8 flags; } __attribute__ ((packed)); struct ceph_mon_subscribe_ack { @@ -220,6 +231,8 @@ struct ceph_mon_subscribe_ack { struct ceph_fsid fsid; } __attribute__ ((packed)); +#define CEPH_FS_CLUSTER_ID_NONE -1 + /* * mdsmap flags */ @@ -282,8 +295,15 @@ enum { CEPH_SESSION_RENEWCAPS, CEPH_SESSION_STALE, CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, + CEPH_SESSION_FORCE_RO, + CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; +#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */ + extern const char *ceph_session_op_name(int op); struct ceph_mds_session_head { @@ -307,6 +327,8 @@ enum { CEPH_MDS_OP_LOOKUPHASH = 0x00102, CEPH_MDS_OP_LOOKUPPARENT = 0x00103, CEPH_MDS_OP_LOOKUPINO = 0x00104, + CEPH_MDS_OP_LOOKUPNAME = 0x00105, + CEPH_MDS_OP_GETVXATTR = 0x00106, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, @@ -332,24 +354,61 @@ enum { CEPH_MDS_OP_MKSNAP = 0x01400, CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, + CEPH_MDS_OP_RENAMESNAP = 0x01403, }; -extern const char *ceph_mds_op_name(int op); +#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \ + op == CEPH_MDS_OP_MKNOD || \ + op == CEPH_MDS_OP_MKDIR || \ + op == CEPH_MDS_OP_SYMLINK) +extern const char *ceph_mds_op_name(int op); -#define CEPH_SETATTR_MODE 1 -#define CEPH_SETATTR_UID 2 -#define CEPH_SETATTR_GID 4 -#define CEPH_SETATTR_MTIME 8 -#define CEPH_SETATTR_ATIME 16 -#define CEPH_SETATTR_SIZE 32 -#define CEPH_SETATTR_CTIME 64 +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#define CEPH_SETATTR_KILL_SGUID (1 << 10) +#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11) +#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12) /* * Ceph setxattr request flags. */ -#define CEPH_XATTR_CREATE 1 -#define CEPH_XATTR_REPLACE 2 +#define CEPH_XATTR_CREATE (1 << 0) +#define CEPH_XATTR_REPLACE (1 << 1) +#define CEPH_XATTR_REMOVE (1 << 31) + +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) + +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 union ceph_mds_request_args { struct { @@ -368,6 +427,8 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; + __le32 offset_hash; } __attribute__ ((packed)) readdir; struct { __le32 mode; @@ -382,30 +443,53 @@ union ceph_mds_request_args { __le32 stripe_unit; /* layout for newly created file */ __le32 stripe_count; /* ... */ __le32 object_size; - __le32 file_replication; - __le32 unused; /* used to be preferred osd */ + __le32 pool; + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; } __attribute__ ((packed)) open; struct { __le32 flags; + __le32 osdmap_epoch; /* used for setting file/dir layouts */ } __attribute__ ((packed)) setxattr; struct { - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; } __attribute__ ((packed)) setlayout; struct { __u8 rule; /* currently fcntl or flock */ __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* owner of the lock */ __le64 pid; /* process id requesting the lock */ - __le64 pid_namespace; __le64 start; /* initial location to lock */ __le64 length; /* num bytes to lock from start */ __u8 wait; /* will caller wait for lock to become available? */ } __attribute__ ((packed)) filelock_change; + struct { + __le32 mask; /* CEPH_CAP_* */ + __le64 snapid; + __le64 parent; + __le32 hash; + } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); -#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ -#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ +union ceph_mds_request_args_ext { + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; +}; + +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ +#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ -struct ceph_mds_request_head { +struct ceph_mds_request_head_legacy { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -418,6 +502,28 @@ struct ceph_mds_request_head { union ceph_mds_request_args args; } __attribute__ ((packed)); +#define CEPH_MDS_REQUEST_HEAD_VERSION 3 + +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; + + __le32 ext_num_retry; /* new count retry attempts */ + __le32 ext_num_fwd; /* new count fwd attempts */ + + __le32 struct_len; /* to store size of struct ceph_mds_request_head */ + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */ +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */ @@ -457,7 +563,8 @@ struct ceph_mds_reply_cap { __u8 flags; /* CEPH_CAP_FLAG_* */ } __attribute__ ((packed)); -#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ /* inode record, for bundling with mds reply */ struct ceph_mds_reply_inode { @@ -467,7 +574,7 @@ struct ceph_mds_reply_inode { __le64 version; /* inode version */ __le64 xattr_version; /* version for xattr blob */ struct ceph_mds_reply_cap cap; /* caps issued for this inode */ - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; struct ceph_timespec ctime, mtime, atime; __le32 time_warp_seq; __le64 size, max_size, truncate_size; @@ -487,6 +594,9 @@ struct ceph_mds_reply_lease { __le32 seq; } __attribute__ ((packed)); +#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */ +#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */ + struct ceph_mds_reply_dirfrag { __le32 frag; /* fragment */ __le32 auth; /* auth mds, if this is a delegation point */ @@ -494,8 +604,11 @@ struct ceph_mds_reply_dirfrag { __le32 dist[]; } __attribute__ ((packed)); -#define CEPH_LOCK_FCNTL 1 -#define CEPH_LOCK_FLOCK 2 +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 +#define CEPH_LOCK_FCNTL_INTR 3 +#define CEPH_LOCK_FLOCK_INTR 4 + #define CEPH_LOCK_SHARED 1 #define CEPH_LOCK_EXCL 2 @@ -505,8 +618,8 @@ struct ceph_filelock { __le64 start;/* file offset to start lock at */ __le64 length; /* num bytes to lock; 0 for all following start */ __le64 client; /* which client holds the lock */ + __le64 owner; /* owner the lock */ __le64 pid; /* process id holding the lock on the client */ - __le64 pid_namespace; __u8 type; /* shared lock, exclusive lock, or unlock */ } __attribute__ ((packed)); @@ -517,10 +630,12 @@ struct ceph_filelock { #define CEPH_FILE_MODE_WR 2 #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ -#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ +#define CEPH_FILE_MODE_BITS 4 +#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1) int ceph_flags_to_mode(int flags); +#define CEPH_INLINE_NONE ((__u64)-1) /* capability bits */ #define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ @@ -585,6 +700,9 @@ int ceph_flags_to_mode(int flags); CEPH_CAP_LINK_SHARED | \ CEPH_CAP_FILE_SHARED | \ CEPH_CAP_XATTR_SHARED) +#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_FILE_RD) +#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ CEPH_CAP_LINK_SHARED | \ @@ -597,16 +715,27 @@ int ceph_flags_to_mode(int flags); CEPH_CAP_LINK_EXCL | \ CEPH_CAP_XATTR_EXCL | \ CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \ + CEPH_CAP_FILE_SHARED) #define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ CEPH_CAP_FILE_EXCL) #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ CEPH_CAP_PIN) +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ CEPH_LOCK_IXATTR) +/* cap masks async dir operations */ +#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE +#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD +#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO) + int ceph_caps_for_mode(int mode); enum { @@ -627,6 +756,11 @@ enum { extern const char *ceph_cap_op_name(int op); +/* flags field in client cap messages (version >= 10) */ +#define CEPH_CLIENT_CAPS_SYNC (1<<0) +#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1) +#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2) + /* * caps message, used for capability callbacks, acks, requests, etc. */ @@ -650,14 +784,22 @@ struct ceph_mds_caps { __le32 xattr_len; __le64 xattr_version; - /* filelock */ + /* a union of non-export and export bodies. */ __le64 size, max_size, truncate_size; __le32 truncate_seq; struct ceph_timespec mtime, atime, ctime; - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; __le32 time_warp_seq; } __attribute__ ((packed)); +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 issue_seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + /* cap release msg head */ struct ceph_mds_cap_release { __le32 num; /* number of cap_items that follow */ @@ -666,7 +808,7 @@ struct ceph_mds_cap_release { struct ceph_mds_cap_item { __le64 ino; __le64 cap_id; - __le32 migrate_seq, seq; + __le32 migrate_seq, issue_seq; } __attribute__ ((packed)); #define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ @@ -750,4 +892,20 @@ struct ceph_mds_snap_realm { } __attribute__ ((packed)); /* followed by my snap list, then prior parent snap list */ +/* + * quotas + */ +struct ceph_mds_quota { + __le64 ino; /* ino */ + struct ceph_timespec rctime; + __le64 rbytes; /* dir stats */ + __le64 rfiles; + __le64 rsubdirs; + __u8 struct_v; /* compat */ + __u8 struct_compat; + __le32 struct_len; + __le64 max_bytes; /* quota max. bytes */ + __le64 max_files; /* quota max. files */ +} __attribute__ ((packed)); + #endif diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h index d099c3f90236..fda474c7a5d6 100644 --- a/include/linux/ceph/ceph_hash.h +++ b/include/linux/ceph/ceph_hash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_CEPH_HASH_H #define FS_CEPH_HASH_H diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h new file mode 100644 index 000000000000..17bc7584d1fe --- /dev/null +++ b/include/linux/ceph/cls_lock_client.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CEPH_CLS_LOCK_CLIENT_H +#define _LINUX_CEPH_CLS_LOCK_CLIENT_H + +#include <linux/ceph/osd_client.h> + +enum ceph_cls_lock_type { + CEPH_CLS_LOCK_NONE = 0, + CEPH_CLS_LOCK_EXCLUSIVE = 1, + CEPH_CLS_LOCK_SHARED = 2, +}; + +struct ceph_locker_id { + struct ceph_entity_name name; /* locker's client name */ + char *cookie; /* locker's cookie */ +}; + +struct ceph_locker_info { + struct ceph_entity_addr addr; /* locker's address */ +}; + +struct ceph_locker { + struct ceph_locker_id id; + struct ceph_locker_info info; +}; + +int ceph_cls_lock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *cookie, + char *tag, char *desc, u8 flags); +int ceph_cls_unlock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, char *cookie); +int ceph_cls_break_lock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, char *cookie, + struct ceph_entity_name *locker); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie); + +void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers); + +int ceph_cls_lock_info(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 *type, char **tag, + struct ceph_locker **lockers, u32 *num_lockers); + +int ceph_cls_assert_locked(struct ceph_osd_request *req, int which, + char *lock_name, u8 type, char *cookie, char *tag); + +#endif diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h index 1df086d7882d..8b3a1a7a953a 100644 --- a/include/linux/ceph/debugfs.h +++ b/include/linux/ceph/debugfs.h @@ -1,32 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_DEBUGFS_H #define _FS_CEPH_DEBUGFS_H -#include <linux/ceph/ceph_debug.h> #include <linux/ceph/types.h> -#define CEPH_DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - /* debugfs.c */ -extern int ceph_debugfs_init(void); +extern void ceph_debugfs_init(void); extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_init(struct ceph_client *client); extern void ceph_debugfs_client_cleanup(struct ceph_client *client); #endif diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 0442c3d800f0..8fc1aed64113 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -1,30 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CEPH_DECODE_H #define __CEPH_DECODE_H #include <linux/err.h> #include <linux/bug.h> +#include <linux/slab.h> #include <linux/time.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/ceph/types.h> -/* This seemed to be the easiest place to define these */ - -#define U8_MAX ((u8)(~0U)) -#define U16_MAX ((u16)(~0U)) -#define U32_MAX ((u32)(~0U)) -#define U64_MAX ((u64)(~0ULL)) - -#define S8_MAX ((s8)(U8_MAX >> 1)) -#define S16_MAX ((s16)(U16_MAX >> 1)) -#define S32_MAX ((s32)(U32_MAX >> 1)) -#define S64_MAX ((s64)(U64_MAX >> 1LL)) - -#define S8_MIN ((s8)(-S8_MAX - 1)) -#define S16_MIN ((s16)(-S16_MAX - 1)) -#define S32_MIN ((s32)(-S32_MAX - 1)) -#define S64_MIN ((s64)(-S64_MAX - 1LL)) - /* * in all cases, * void **p pointer to position pointer @@ -64,7 +49,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) /* * bounds check input. */ -static inline int ceph_has_room(void **p, void *end, size_t n) +static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } @@ -149,16 +134,82 @@ bad: } /* - * struct ceph_timespec <-> struct timespec + * skip helpers */ -static inline void ceph_decode_timespec(struct timespec *ts, - const struct ceph_timespec *tv) +#define ceph_decode_skip_n(p, end, n, bad) \ + do { \ + ceph_decode_need(p, end, n, bad); \ + *p += n; \ + } while (0) + +#define ceph_decode_skip_64(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u64), bad) + +#define ceph_decode_skip_32(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u32), bad) + +#define ceph_decode_skip_16(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u16), bad) + +#define ceph_decode_skip_8(p, end, bad) \ +ceph_decode_skip_n(p, end, sizeof(u8), bad) + +#define ceph_decode_skip_string(p, end, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + ceph_decode_skip_n(p, end, len, bad); \ + } while (0) + +#define ceph_decode_skip_set(p, end, type, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) \ + ceph_decode_skip_##type(p, end, bad); \ + } while (0) + +#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) { \ + ceph_decode_skip_##ktype(p, end, bad); \ + ceph_decode_skip_##vtype(p, end, bad); \ + } \ + } while (0) + +#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \ + do { \ + u32 len; \ + \ + ceph_decode_32_safe(p, end, len, bad); \ + while (len--) { \ + ceph_decode_skip_##ktype1(p, end, bad); \ + ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \ + } \ + } while (0) + +/* + * struct ceph_timespec <-> struct timespec64 + */ +static inline void ceph_decode_timespec64(struct timespec64 *ts, + const struct ceph_timespec *tv) { - ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec); + /* + * This will still overflow in year 2106. We could extend + * the protocol to steal two more bits from tv_nsec to + * add three more 136 year epochs after that the way ext4 + * does if necessary. + */ + ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec); ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); } -static inline void ceph_encode_timespec(struct ceph_timespec *tv, - const struct timespec *ts) +static inline void ceph_encode_timespec64(struct ceph_timespec *tv, + const struct timespec64 *ts) { tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); @@ -167,18 +218,35 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv, /* * sockaddr_storage <-> ceph_sockaddr */ -static inline void ceph_encode_addr(struct ceph_entity_addr *a) +#define CEPH_ENTITY_ADDR_TYPE_NONE 0 +#define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1) +#define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2) +#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3) + +static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a) { __be16 ss_family = htons(a->in_addr.ss_family); a->in_addr.ss_family = *(__u16 *)&ss_family; + + /* Banner addresses require TYPE_NONE */ + a->type = CEPH_ENTITY_ADDR_TYPE_NONE; } -static inline void ceph_decode_addr(struct ceph_entity_addr *a) +static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a) { __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; a->in_addr.ss_family = ntohs(ss_family); WARN_ON(a->in_addr.ss_family == 512); + a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY; } +extern int ceph_decode_entity_addr(void **p, void *end, + struct ceph_entity_addr *addr); +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr); + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr); +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr); + /* * encoders */ @@ -234,6 +302,60 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } +/* + * version and length starting block encoders/decoders + */ + +/* current code version (u8) + compat code version (u8) + len of struct (u32) */ +#define CEPH_ENCODING_START_BLK_LEN 6 + +/** + * ceph_start_encoding - start encoding block + * @struct_v: current (code) version of the encoding + * @struct_compat: oldest code version that can decode it + * @struct_len: length of struct encoding + */ +static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat, + u32 struct_len) +{ + ceph_encode_8(p, struct_v); + ceph_encode_8(p, struct_compat); + ceph_encode_32(p, struct_len); +} + +/** + * ceph_start_decoding - start decoding block + * @v: current version of the encoding that the code supports + * @name: name of the struct (free-form) + * @struct_v: out param for the encoding version + * @struct_len: out param for the length of struct encoding + * + * Validates the length of struct encoding, so unsafe ceph_decode_* + * variants can be used for decoding. + */ +static inline int ceph_start_decoding(void **p, void *end, u8 v, + const char *name, u8 *struct_v, + u32 *struct_len) +{ + u8 struct_compat; + + ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad); + *struct_v = ceph_decode_8(p); + struct_compat = ceph_decode_8(p); + if (v < struct_compat) { + pr_warn("got struct_v %d struct_compat %d > %d of %s\n", + *struct_v, struct_compat, v, name); + return -EINVAL; + } + + *struct_len = ceph_decode_32(p); + ceph_decode_need(p, end, *struct_len, bad); + return 0; + +bad: + return -ERANGE; +} + #define ceph_encode_need(p, end, n, bad) \ do { \ if (!likely(ceph_has_room(p, end, n))) \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2e3024881a5e..63e0e2aa1ce9 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -1,9 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_LIBCEPH_H #define _FS_CEPH_LIBCEPH_H #include <linux/ceph/ceph_debug.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/backing-dev.h> #include <linux/completion.h> #include <linux/exportfs.h> @@ -14,6 +15,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/refcount.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> @@ -21,6 +23,7 @@ #include <linux/ceph/mon_client.h> #include <linux/ceph/osd_client.h> #include <linux/ceph/ceph_fs.h> +#include <linux/ceph/string_table.h> /* * mount options @@ -28,9 +31,13 @@ #define CEPH_OPT_FSID (1<<0) #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */ +#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ +#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ +#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ -#define CEPH_OPT_DEFAULT (0) +#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) #define ceph_set_opt(client, opt) \ (client)->options->flags |= CEPH_OPT_##opt; @@ -41,12 +48,15 @@ struct ceph_options { int flags; struct ceph_fsid fsid; struct ceph_entity_addr my_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_keepalive_timeout; + unsigned long mount_timeout; /* jiffies */ + unsigned long osd_idle_ttl; /* jiffies */ + unsigned long osd_keepalive_timeout; /* jiffies */ + unsigned long osd_request_timeout; /* jiffies */ + u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ + int con_modes[2]; /* CEPH_CON_MODE_* */ /* - * any type that can't be simply compared or doesn't need need + * any type that can't be simply compared or doesn't need * to be compared should go beyond this point, * ceph_compare_options() should be updated accordingly */ @@ -56,48 +66,42 @@ struct ceph_options { int num_mon; char *name; struct ceph_crypto_key *key; + struct rb_root crush_locs; }; /* * defaults */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 - +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ +#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */ + +#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) +#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) +#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) +#define CEPH_MONC_HUNT_BACKOFF 2 +#define CEPH_MONC_HUNT_MAX_MULT 10 + +#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_AUTH_NAME_DEFAULT "guest" /* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. + * The largest possible rbd data object is 32M. + * The largest possible rbd object map object is 64M. + * + * There is no limit on the size of cephfs objects, but it has to obey + * rsize and wsize mount options anyway. */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; +#define CEPH_MSG_MAX_DATA_LEN (64*1024*1024) -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) +#define CEPH_AUTH_NAME_DEFAULT "guest" + +static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) { - BUG_ON(time_after(b, a)); - return (long)a - (long)b; + return timeout ?: MAX_SCHEDULE_TIMEOUT; } struct ceph_mds_client; @@ -122,8 +126,8 @@ struct ceph_client { int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -133,10 +137,16 @@ struct ceph_client { struct dentry *debugfs_dir; struct dentry *debugfs_monmap; struct dentry *debugfs_osdmap; + struct dentry *debugfs_options; #endif }; +#define from_msgr(ms) container_of(ms, struct ceph_client, msgr) +static inline bool ceph_msgr2(struct ceph_client *client) +{ + return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN; +} /* * snapshots @@ -151,7 +161,7 @@ struct ceph_client { * dirtied. */ struct ceph_snap_context { - atomic_t nref; + refcount_t nref; u64 seq; u32 num_snaps; u64 snaps[]; @@ -169,58 +179,146 @@ extern void ceph_put_snap_context(struct ceph_snap_context *sc); */ static inline int calc_pages_for(u64 off, u64 len) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + return ((off+len+PAGE_SIZE-1) >> PAGE_SHIFT) - + (off >> PAGE_SHIFT); } -/* ceph_common.c */ -extern bool libceph_compatible(void *data); +#define RB_BYVAL(a) (a) +#define RB_BYPTR(a) (&(a)) +#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b)) + +#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ +static bool __insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + int cmp; \ + \ + parent = *n; \ + cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \ + if (cmp < 0) \ + n = &(*n)->rb_left; \ + else if (cmp > 0) \ + n = &(*n)->rb_right; \ + else \ + return false; \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ + return true; \ +} \ +static void __maybe_unused insert_##name(struct rb_root *root, type *t) \ +{ \ + if (!__insert_##name(root, t)) \ + BUG(); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +/* + * @lookup_param_type is a parameter and not constructed from (@type, + * @keyfld) with typeof() because adding const is too unwieldy. + */ +#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) \ +static type *lookup_##name(struct rb_root *root, lookup_param_type key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + int cmp; \ + \ + cmp = cmpexp(key, keyexp(cur->keyfld)); \ + if (cmp < 0) \ + n = n->rb_left; \ + else if (cmp > 0) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) \ +DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ +DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ + lookup_param_type, nodefld) + +/* + * Shorthands for integer keys. + */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld) + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +extern type __lookup_##name##_key; \ +DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \ + typeof(__lookup_##name##_key.keyfld), nodefld) + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_cap_snap_cachep; +extern struct kmem_cache *ceph_cap_flush_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; +extern struct kmem_cache *ceph_dir_file_cachep; +extern struct kmem_cache *ceph_mds_request_cachep; +extern mempool_t *ceph_wb_pagevec_pool; -extern struct ceph_options *ceph_parse_options(char *options, - const char *dev_name, const char *dev_name_end, - int (*parse_extra_token)(char *c, void *private), - void *private); +/* ceph_common.c */ +extern bool libceph_compatible(void *data); + +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); + +struct fs_parameter; +struct fc_log; +struct ceph_options *ceph_alloc_options(void); +int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, + struct fc_log *l, char delim); +int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, + struct fc_log *l); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, + bool show_all); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); -extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private, - unsigned supported_features, - unsigned required_features); -extern u64 ceph_client_id(struct ceph_client *client); +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); +struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); +u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); -extern int __ceph_open_session(struct ceph_client *client, - unsigned long started); +extern void ceph_reset_client_addr(struct ceph_client *client); +extern int __ceph_open_session(struct ceph_client *client); extern int ceph_open_session(struct ceph_client *client); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); - -extern struct page **ceph_get_direct_page_vector(const void __user *data, - int num_pages, - bool write_page); extern void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty); -extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); -extern int ceph_copy_user_to_page_vector(struct page **pages, - const void __user *data, - loff_t off, size_t len); -extern void ceph_copy_to_page_vector(struct page **pages, - const void *data, - loff_t off, size_t len); extern void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len); -extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data, - loff_t off, size_t len); extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h deleted file mode 100644 index 87ed09f54800..000000000000 --- a/include/linux/ceph/mdsmap.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef _FS_CEPH_MDSMAP_H -#define _FS_CEPH_MDSMAP_H - -#include <linux/bug.h> -#include <linux/ceph/types.h> - -/* - * mds map - describe servers in the mds cluster. - * - * we limit fields to those the client actually xcares about - */ -struct ceph_mds_info { - u64 global_id; - struct ceph_entity_addr addr; - s32 state; - int num_export_targets; - bool laggy; - u32 *export_targets; -}; - -struct ceph_mdsmap { - u32 m_epoch, m_client_epoch, m_last_failure; - u32 m_root; - u32 m_session_timeout; /* seconds */ - u32 m_session_autoclose; /* seconds */ - u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - struct ceph_mds_info *m_info; - - /* which object pools file data can be stored in */ - int m_num_data_pg_pools; - u64 *m_data_pg_pools; - u64 m_cas_pg_pool; -}; - -static inline struct ceph_entity_addr * -ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) -{ - if (w >= m->m_max_mds) - return NULL; - return &m->m_info[w].addr; -} - -static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) -{ - BUG_ON(w < 0); - if (w >= m->m_max_mds) - return CEPH_MDS_STATE_DNE; - return m->m_info[w].state; -} - -static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) -{ - if (w >= 0 && w < m->m_max_mds) - return m->m_info[w].laggy; - return false; -} - -extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); -extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); -extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); - -#endif diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 7c1420bb1dce..6aa4c6478c9f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -1,18 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H +#include <crypto/sha2.h> +#include <linux/bvec.h> +#include <linux/crypto.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/net.h> #include <linux/radix-tree.h> #include <linux/uio.h> #include <linux/workqueue.h> +#include <net/net_namespace.h> #include <linux/ceph/types.h> #include <linux/ceph/buffer.h> struct ceph_msg; struct ceph_connection; +struct ceph_msg_data_cursor; /* * Ceph defines these callbacks for handling connection events. @@ -28,7 +34,10 @@ struct ceph_connection_operations { struct ceph_auth_handshake *(*get_authorizer) ( struct ceph_connection *con, int *proto, int force_new); - int (*verify_authorizer_reply) (struct ceph_connection *con, int len); + int (*add_authorizer_challenge)(struct ceph_connection *con, + void *challenge_buf, + int challenge_buf_len); + int (*verify_authorizer_reply) (struct ceph_connection *con); int (*invalidate_authorizer)(struct ceph_connection *con); /* there was some error on the socket (disconnect, whatever) */ @@ -41,9 +50,55 @@ struct ceph_connection_operations { struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); + + void (*reencode_message) (struct ceph_msg *msg); + + int (*sign_message) (struct ceph_msg *msg); + int (*check_message_signature) (struct ceph_msg *msg); + + /* msgr2 authentication exchange */ + int (*get_auth_request)(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_reply_more)(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_done)(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); + int (*handle_auth_bad_method)(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + + /** + * sparse_read: read sparse data + * @con: connection we're reading from + * @cursor: data cursor for reading extents + * @buf: optional buffer to read into + * + * This should be called more than once, each time setting up to + * receive an extent into the current cursor position, and zeroing + * the holes between them. + * + * Returns amount of data to be read (in bytes), 0 if reading is + * complete, or -errno if there was an error. + * + * If @buf is set on a >0 return, then the data should be read into + * the provided buffer. Otherwise, it should be read into the cursor. + * + * The sparse read operation is expected to initialize the cursor + * with a length covering up to the end of the last extent. + */ + int (*sparse_read)(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor, + char **buf); + }; -/* use format string %s%d */ +/* use format string %s%lld */ #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { @@ -51,7 +106,7 @@ struct ceph_messenger { struct ceph_entity_addr my_enc_addr; atomic_t stopping; - bool nocrc; + possible_net_t net; /* * the global_seq counts connections i (attempt to) initiate @@ -59,9 +114,6 @@ struct ceph_messenger { */ u32 global_seq; spinlock_t global_seq_lock; - - u32 supported_features; - u32 required_features; }; enum ceph_msg_data_type { @@ -71,58 +123,125 @@ enum ceph_msg_data_type { #ifdef CONFIG_BLOCK CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ #endif /* CONFIG_BLOCK */ + CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ + CEPH_MSG_DATA_ITER, /* data source/destination is an iov_iter */ }; -static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) -{ - switch (type) { - case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: - case CEPH_MSG_DATA_PAGELIST: #ifdef CONFIG_BLOCK - case CEPH_MSG_DATA_BIO: + +struct ceph_bio_iter { + struct bio *bio; + struct bvec_iter iter; +}; + +#define __ceph_bio_iter_advance_step(it, n, STEP) do { \ + unsigned int __n = (n), __cur_n; \ + \ + while (__n) { \ + BUG_ON(!(it)->iter.bi_size); \ + __cur_n = min((it)->iter.bi_size, __n); \ + (void)(STEP); \ + bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \ + if (!(it)->iter.bi_size && (it)->bio->bi_next) { \ + dout("__ceph_bio_iter_advance_step next bio\n"); \ + (it)->bio = (it)->bio->bi_next; \ + (it)->iter = (it)->bio->bi_iter; \ + } \ + __n -= __cur_n; \ + } \ +} while (0) + +/* + * Advance @it by @n bytes. + */ +#define ceph_bio_iter_advance(it, n) \ + __ceph_bio_iter_advance_step(it, n, 0) + +/* + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. + */ +#define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \ + __ceph_bio_iter_advance_step(it, n, ({ \ + struct bio_vec bv; \ + struct bvec_iter __cur_iter; \ + \ + __cur_iter = (it)->iter; \ + __cur_iter.bi_size = __cur_n; \ + __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \ + (void)(BVEC_STEP); \ + })) + #endif /* CONFIG_BLOCK */ - return true; - default: - return false; - } -} + +struct ceph_bvec_iter { + struct bio_vec *bvecs; + struct bvec_iter iter; +}; + +#define __ceph_bvec_iter_advance_step(it, n, STEP) do { \ + BUG_ON((n) > (it)->iter.bi_size); \ + (void)(STEP); \ + bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \ +} while (0) + +/* + * Advance @it by @n bytes. + */ +#define ceph_bvec_iter_advance(it, n) \ + __ceph_bvec_iter_advance_step(it, n, 0) + +/* + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. + */ +#define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \ + __ceph_bvec_iter_advance_step(it, n, ({ \ + struct bio_vec bv; \ + struct bvec_iter __cur_iter; \ + \ + __cur_iter = (it)->iter; \ + __cur_iter.bi_size = (n); \ + for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \ + (void)(BVEC_STEP); \ + })) + +#define ceph_bvec_iter_shorten(it, n) do { \ + BUG_ON((n) > (it)->iter.bi_size); \ + (it)->iter.bi_size = (n); \ +} while (0) struct ceph_msg_data { - struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK struct { - struct bio *bio; - size_t bio_length; + struct ceph_bio_iter bio_pos; + u32 bio_length; }; #endif /* CONFIG_BLOCK */ + struct ceph_bvec_iter bvec_pos; struct { - struct page **pages; /* NOT OWNER. */ + struct page **pages; size_t length; /* total # bytes */ unsigned int alignment; /* first page */ + bool own_pages; }; struct ceph_pagelist *pagelist; + struct iov_iter iter; }; }; struct ceph_msg_data_cursor { size_t total_resid; /* across all data items */ - struct list_head *data_head; /* = &ceph_msg->data */ struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ - bool last_piece; /* current is last piece */ + int sr_resid; /* residual sparse_read len */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK - struct { /* bio */ - struct bio *bio; /* bio from list */ - unsigned int vector_index; /* vector from bio */ - unsigned int vector_offset; /* bytes from vector */ - }; + struct ceph_bio_iter bio_iter; #endif /* CONFIG_BLOCK */ + struct bvec_iter bvec_iter; struct { /* pages */ unsigned int page_offset; /* offset in page */ unsigned short page_index; /* index in array */ @@ -132,6 +251,10 @@ struct ceph_msg_data_cursor { struct page *page; /* page from list */ size_t offset; /* bytes from list */ }; + struct { + struct iov_iter iov_iter; + unsigned int lastlen; + }; }; }; @@ -142,30 +265,202 @@ struct ceph_msg_data_cursor { */ struct ceph_msg { struct ceph_msg_header hdr; /* header */ - struct ceph_msg_footer footer; /* footer */ + union { + struct ceph_msg_footer footer; /* footer */ + struct ceph_msg_footer_old old_footer; /* old format footer */ + }; struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; size_t data_length; - struct list_head data; + struct ceph_msg_data *data; + int num_data_items; + int max_data_items; struct ceph_msg_data_cursor cursor; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ struct kref kref; - bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; - int front_max; - unsigned long ack_stamp; /* tx: when we were acked */ + u64 sparse_read_total; + int front_alloc_len; struct ceph_msgpool *pool; }; +/* + * connection states + */ +#define CEPH_CON_S_CLOSED 1 +#define CEPH_CON_S_PREOPEN 2 +#define CEPH_CON_S_V1_BANNER 3 +#define CEPH_CON_S_V1_CONNECT_MSG 4 +#define CEPH_CON_S_V2_BANNER_PREFIX 5 +#define CEPH_CON_S_V2_BANNER_PAYLOAD 6 +#define CEPH_CON_S_V2_HELLO 7 +#define CEPH_CON_S_V2_AUTH 8 +#define CEPH_CON_S_V2_AUTH_SIGNATURE 9 +#define CEPH_CON_S_V2_SESSION_CONNECT 10 +#define CEPH_CON_S_V2_SESSION_RECONNECT 11 +#define CEPH_CON_S_OPEN 12 +#define CEPH_CON_S_STANDBY 13 + +/* + * ceph_connection flag bits + */ +#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop + messages on errors */ +#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */ +#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed + work */ + /* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) +#define BASE_DELAY_INTERVAL (HZ / 4) +#define MAX_DELAY_INTERVAL (15 * HZ) + +struct ceph_connection_v1_info { + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_more; /* there is more data after the kvecs */ + bool out_msg_done; + + struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ + + /* connection negotiation temps */ + u8 in_banner[CEPH_BANNER_MAX_LEN]; + struct ceph_entity_addr actual_peer_addr; + struct ceph_entity_addr peer_addr_for_me; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + + int in_base_pos; /* bytes read */ + + /* sparse reads */ + struct kvec in_sr_kvec; /* current location to receive into */ + u64 in_sr_len; /* amount of data in this extent */ + + /* message in temps */ + u8 in_tag; /* protocol control byte */ + struct ceph_msg_header in_hdr; + __le64 in_temp_ack; /* for reading an ack */ + + /* message out temps */ + struct ceph_msg_header out_hdr; + __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ + + u32 connect_seq; /* identify the most recent connection + attempt for this session */ + u32 peer_global_seq; /* peer's global seq for this connection */ +}; + +#define CEPH_CRC_LEN 4 +#define CEPH_GCM_KEY_LEN 16 +#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce) +#define CEPH_GCM_BLOCK_LEN 16 +#define CEPH_GCM_TAG_LEN 16 + +#define CEPH_PREAMBLE_LEN 32 +#define CEPH_PREAMBLE_INLINE_LEN 48 +#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN +#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \ + CEPH_PREAMBLE_INLINE_LEN + \ + CEPH_GCM_TAG_LEN) +#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN) +#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN) + +#define CEPH_FRAME_MAX_SEGMENT_COUNT 4 + +struct ceph_frame_desc { + int fd_tag; /* FRAME_TAG_* */ + int fd_seg_cnt; + int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */ + int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT]; +}; + +struct ceph_gcm_nonce { + __le32 fixed; + __le64 counter __packed; +}; + +struct ceph_connection_v2_info { + struct iov_iter in_iter; + struct kvec in_kvecs[5]; /* recvmsg */ + struct bio_vec in_bvec; /* recvmsg (in_cursor) */ + int in_kvec_cnt; + int in_state; /* IN_S_* */ + + struct iov_iter out_iter; + struct kvec out_kvecs[8]; /* sendmsg */ + struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero), + sendmsg (out_enc_pages) */ + int out_kvec_cnt; + int out_state; /* OUT_S_* */ + + int out_zero; /* # of zero bytes to send */ + bool out_iter_sendpage; /* use sendpage if possible */ + + struct ceph_frame_desc in_desc; + struct ceph_msg_data_cursor in_cursor; + struct ceph_msg_data_cursor out_cursor; + + struct hmac_sha256_key hmac_key; /* post-auth signature */ + bool hmac_key_set; + struct crypto_aead *gcm_tfm; /* on-wire encryption */ + struct aead_request *gcm_req; + struct crypto_wait gcm_wait; + struct ceph_gcm_nonce in_gcm_nonce; + struct ceph_gcm_nonce out_gcm_nonce; + + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; + struct page **out_enc_pages; + int out_enc_page_cnt; + int out_enc_resid; + int out_enc_i; + + int con_mode; /* CEPH_CON_MODE_* */ + + void *conn_bufs[16]; + int conn_buf_cnt; + int data_len_remain; + + struct kvec in_sign_kvecs[8]; + struct kvec out_sign_kvecs[8]; + int in_sign_kvec_cnt; + int out_sign_kvec_cnt; + + u64 client_cookie; + u64 server_cookie; + u64 global_seq; + u64 connect_seq; + u64 peer_global_seq; + + u8 in_buf[CEPH_PREAMBLE_SECURE_LEN]; + u8 out_buf[CEPH_PREAMBLE_SECURE_LEN]; + struct { + u8 late_status; /* FRAME_LATE_STATUS_* */ + union { + struct { + u32 front_crc; + u32 middle_crc; + u32 data_crc; + } __packed; + u8 pad[CEPH_GCM_BLOCK_LEN - 1]; + }; + } out_epil; +}; /* * A single connection with another host. @@ -181,25 +476,16 @@ struct ceph_connection { struct ceph_messenger *msgr; + int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_addr peer_addr_for_me; - unsigned long flags; - unsigned long state; + unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ - - unsigned peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - int auth_retry; /* true if we need a newer authorizer */ - void *auth_reply_buf; /* where to put the authorizer reply */ - int auth_reply_buf_len; + struct ceph_entity_addr peer_addr; /* peer address */ + u64 peer_features; struct mutex mutex; @@ -210,55 +496,95 @@ struct ceph_connection { u64 in_seq, in_seq_acked; /* last message received, acked */ - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ + struct ceph_msg *in_msg; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ - bool out_msg_done; - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - bool out_kvec_is_msg; /* kvec refers to out_msg */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ + struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + union { + struct ceph_connection_v1_info v1; + struct ceph_connection_v2_info v2; + }; }; +extern struct page *ceph_zero_page; + +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag); +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag); +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag); + +void ceph_encode_my_addr(struct ceph_messenger *msgr); + +int ceph_tcp_connect(struct ceph_connection *con); +int ceph_con_close_socket(struct ceph_connection *con); +void ceph_con_reset_session(struct ceph_connection *con); + +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt); +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq); +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); + +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length); +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length); +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); + +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length); + +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr); +int ceph_addr_port(const struct ceph_entity_addr *addr); +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); + +void ceph_con_process_message(struct ceph_connection *con); +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip); +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); + +/* messenger_v1.c */ +int ceph_con_v1_try_read(struct ceph_connection *con); +int ceph_con_v1_try_write(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg); +void ceph_con_v1_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v1_opened(struct ceph_connection *con); +void ceph_con_v1_reset_session(struct ceph_connection *con); +void ceph_con_v1_reset_protocol(struct ceph_connection *con); + +/* messenger_v2.c */ +int ceph_con_v2_try_read(struct ceph_connection *con); +int ceph_con_v2_try_write(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg); +void ceph_con_v2_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v2_opened(struct ceph_connection *con); +void ceph_con_v2_reset_session(struct ceph_connection *con); +void ceph_con_v2_reset_protocol(struct ceph_connection *con); + + +extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); -extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); - + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, - struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, - bool nocrc); + struct ceph_entity_addr *myaddr); +extern void ceph_messenger_fini(struct ceph_messenger *msgr); +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, @@ -274,31 +600,29 @@ extern void ceph_msg_revoke(struct ceph_msg *msg); extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); +extern bool ceph_con_keepalive_expired(struct ceph_connection *con, + unsigned long interval); -extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, - size_t length, size_t alignment); +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment, bool own_pages); extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, - size_t length); +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, + u32 length); #endif /* CONFIG_BLOCK */ +void ceph_msg_data_add_bvecs(struct ceph_msg *msg, + struct ceph_bvec_iter *bvec_pos); +void ceph_msg_data_add_iter(struct ceph_msg *msg, + struct iov_iter *iter); +struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, + gfp_t flags, bool can_fail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -extern void ceph_msg_kfree(struct ceph_msg *m); - - -static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) -{ - kref_get(&msg->kref); - return msg; -} -extern void ceph_msg_last_put(struct kref *kref); -static inline void ceph_msg_put(struct ceph_msg *msg) -{ - kref_put(&msg->kref, ceph_msg_last_put); -} + +extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg); +extern void ceph_msg_put(struct ceph_msg *msg); extern void ceph_msg_dump(struct ceph_msg *msg); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index a486f390dfbe..7a9a40163c0f 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MON_CLIENT_H #define _FS_CEPH_MON_CLIENT_H @@ -18,7 +19,7 @@ struct ceph_monmap { struct ceph_fsid fsid; u32 epoch; u32 num_mon; - struct ceph_entity_inst mon_inst[0]; + struct ceph_entity_inst mon_inst[] __counted_by(num_mon); }; struct ceph_mon_client; @@ -39,21 +40,31 @@ struct ceph_mon_request { ceph_monc_request_func_t do_request; }; +typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); + /* - * ceph_mon_generic_request is being used for the statfs and poolop requests - * which are bening done a bit differently because we need to get data back - * to the caller + * ceph_mon_generic_request is being used for the statfs and + * mon_get_version requests which are being done a bit differently + * because we need to get data back to the caller */ struct ceph_mon_generic_request { + struct ceph_mon_client *monc; struct kref kref; u64 tid; struct rb_node node; int result; - void *buf; - int buf_len; + struct completion completion; + ceph_monc_callback_t complete_cb; + u64 private_data; /* r_tid/linger_id */ + struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ + + union { + struct ceph_statfs *st; + u64 newest; + } u; }; struct ceph_mon_client { @@ -69,53 +80,73 @@ struct ceph_mon_client { bool hunting; int cur_mon; /* last monitor i contacted */ - unsigned long sub_sent, sub_renew_after; + unsigned long sub_renew_after; + unsigned long sub_renew_sent; struct ceph_connection con; + bool had_a_connection; + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ + /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; - /* mds/osd map */ - int want_mdsmap; - int want_next_osdmap; /* 1 = want, 2 = want+asked */ - u32 have_osdmap, have_mdsmap; + /* subs, indexed with CEPH_SUB_* */ + struct { + struct ceph_mon_subscribe_item item; + bool want; + u32 have; /* epoch */ + } subs[4]; + int fs_cluster_id; /* "mdsmap.<id>" sub */ #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; #endif }; -extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); extern int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr); extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); + +enum { + CEPH_SUB_MONMAP = 0, + CEPH_SUB_OSDMAP, + CEPH_SUB_FSMAP, + CEPH_SUB_MDSMAP, +}; + +extern const char *ceph_sub_str[]; /* * The model here is to indicate that we need a new map of at least - * epoch @want, and also call in when we receive a map. We will + * epoch @epoch, and also call in when we receive a map. We will * periodically rerequest the map from the monitor cluster until we * get what we want. */ -extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); -extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous); +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); +void ceph_monc_renew_subs(struct ceph_mon_client *monc); -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); +extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, + unsigned long timeout); -extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, - struct ceph_statfs *buf); +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf); -extern int ceph_monc_open_session(struct ceph_mon_client *monc); +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest); +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data); -extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, + struct ceph_entity_addr *client_addr); -extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid); +extern int ceph_monc_open_session(struct ceph_mon_client *monc); -extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid); +extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); #endif diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index 4b0d38960726..729cdf700eae 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL #include <linux/mempool.h> -#include <linux/ceph/messenger.h> /* * we use memory pools for preallocating messages we may receive, to @@ -13,14 +13,15 @@ struct ceph_msgpool { mempool_t *pool; int type; /* preallocated message type */ int front_len; /* preallocated payload size */ + int max_data_items; }; -extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type, - int front_len, int size, bool blocking, - const char *name); +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, + int front_len, int max_data_items, int size, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, - int front_len); +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, + int max_data_items); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); #endif diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 3d94a73b5f30..3989dcb94d3d 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef CEPH_MSGR_H #define CEPH_MSGR_H @@ -8,24 +9,45 @@ #define CEPH_MON_PORT 6789 /* default monitor port */ /* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 - -/* * tcp connection banner. include a protocol version. and adjust * whenever the wire protocol changes. try to keep this string length * constant. */ #define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_LEN 9 #define CEPH_BANNER_MAX_LEN 30 /* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n<le16>" + * the 2 bytes are the length of the remaining banner. + */ +#define CEPH_BANNER_V2 "ceph v2\n" +#define CEPH_BANNER_V2_LEN 8 +#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16)) + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + + +/* * Rollover-safe type and comparator for 32-bit sequence numbers. * Comparator returns -1, 0, or 1. */ @@ -60,11 +82,18 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 type; + __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */ __le32 nonce; /* unique id for process (e.g. pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); +static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs, + const struct ceph_entity_addr *rhs) +{ + return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) && + lhs->nonce == rhs->nonce; +} + struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; @@ -84,11 +113,13 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_MSG 7 /* message */ #define CEPH_MSGR_TAG_ACK 8 /* message ack */ #define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ -#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ #define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ - +#define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */ +#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */ +#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */ /* * connection negotiation @@ -152,10 +183,29 @@ struct ceph_msg_header { receiver: mask against ~PAGE_MASK */ struct ceph_entity_name src; - __le32 reserved; + __le16 compat_version; + __le16 reserved; __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. */ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 @@ -164,13 +214,21 @@ struct ceph_msg_header { /* * follows data payload */ +struct ceph_msg_footer_old { + __le32 front_crc, middle_crc, data_crc; + __u8 flags; +} __attribute__ ((packed)); + struct ceph_msg_footer { __le32 front_crc, middle_crc, data_crc; + // sig holds the 64 bits of the digital signature for the message PLR + __le64 sig; __u8 flags; } __attribute__ ((packed)); #define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ #define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ +#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */ #endif diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ce6df39f60ff..50b14a5661c7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -1,55 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_OSD_CLIENT_H #define _FS_CEPH_OSD_CLIENT_H +#include <linux/bitrev.h> #include <linux/completion.h> #include <linux/kref.h> #include <linux/mempool.h> #include <linux/rbtree.h> +#include <linux/refcount.h> +#include <linux/ktime.h> #include <linux/ceph/types.h> #include <linux/ceph/osdmap.h> #include <linux/ceph/messenger.h> +#include <linux/ceph/msgpool.h> #include <linux/ceph/auth.h> #include <linux/ceph/pagelist.h> -/* - * Maximum object name size - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define MAX_OBJ_NAME_SIZE 100 - struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; -struct ceph_authorizer; /* * completion callback for async writepages */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); -typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); + +#define CEPH_HOMELESS_OSD -1 + +/* + * A single extent in a SPARSE_READ reply. + * + * Note that these come from the OSD as little-endian values. On BE arches, + * we convert them in-place after receipt. + */ +struct ceph_sparse_extent { + u64 off; + u64 len; +} __packed; + +/* Sparse read state machine state values */ +enum ceph_sparse_read_state { + CEPH_SPARSE_READ_HDR = 0, + CEPH_SPARSE_READ_EXTENTS, + CEPH_SPARSE_READ_DATA_LEN, + CEPH_SPARSE_READ_DATA_PRE, + CEPH_SPARSE_READ_DATA, +}; + +/* + * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of + * 64-bit offset/length pairs, and then all of the actual file data + * concatenated after it (sans holes). + * + * Unfortunately, we don't know how long the extent array is until we've + * started reading the data section of the reply. The caller should send down + * a destination buffer for the array, but we'll alloc one if it's too small + * or if the caller doesn't. + */ +struct ceph_sparse_read { + enum ceph_sparse_read_state sr_state; /* state machine state */ + u64 sr_req_off; /* orig request offset */ + u64 sr_req_len; /* orig request length */ + u64 sr_pos; /* current pos in buffer */ + int sr_index; /* current extent index */ + u32 sr_datalen; /* length of actual data */ + u32 sr_count; /* extent count in reply */ + int sr_ext_len; /* length of extent array */ + struct ceph_sparse_extent *sr_extent; /* extent array */ +}; -/* a given osd we're communicating with */ +/* + * A given osd we're communicating with. + * + * Note that the o_requests tree can be searched while holding the "lock" mutex + * or the "o_requests_lock" spinlock. Insertion or removal requires both! + */ struct ceph_osd { - atomic_t o_ref; + refcount_t o_ref; + int o_sparse_op_idx; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; - struct list_head o_requests; - struct list_head o_linger_requests; + spinlock_t o_requests_lock; + struct rb_root o_requests; + struct rb_root o_linger_requests; + struct rb_root o_backoff_mappings; + struct rb_root o_backoffs_by_id; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; + struct mutex lock; + struct ceph_sparse_read o_sparse_read; }; - -#define CEPH_OSD_MAX_OP 2 +#define CEPH_OSD_SLAB_OPS 2 +#define CEPH_OSD_MAX_OPS 16 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE = 0, @@ -58,6 +108,8 @@ enum ceph_osd_data_type { #ifdef CONFIG_BLOCK CEPH_OSD_DATA_TYPE_BIO, #endif /* CONFIG_BLOCK */ + CEPH_OSD_DATA_TYPE_BVECS, + CEPH_OSD_DATA_TYPE_ITER, }; struct ceph_osd_data { @@ -73,25 +125,43 @@ struct ceph_osd_data { struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK struct { - struct bio *bio; /* list of bios */ - size_t bio_length; /* total in list */ + struct ceph_bio_iter bio_pos; + u32 bio_length; }; #endif /* CONFIG_BLOCK */ + struct { + struct ceph_bvec_iter bvec_pos; + u32 num_bvecs; + }; + struct iov_iter iter; }; }; struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ - u32 payload_len; + u32 flags; /* CEPH_OSD_OP_FLAG_* */ + u32 indata_len; /* request */ + u32 outdata_len; /* reply */ + s32 rval; + union { struct ceph_osd_data raw_data_in; struct { u64 offset, length; u64 truncate_size; u32 truncate_seq; + int sparse_ext_cnt; + struct ceph_sparse_extent *sparse_ext; struct ceph_osd_data osd_data; } extent; struct { + u32 name_len; + u32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + struct ceph_osd_data osd_data; + } xattr; + struct { const char *class_name; const char *method_name; struct ceph_osd_data request_info; @@ -99,112 +169,258 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; + __u8 op; /* CEPH_OSD_WATCH_OP_ */ + u32 gen; } watch; + struct { + struct ceph_osd_data request_data; + } notify_ack; + struct { + u64 cookie; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + } notify; + struct { + struct ceph_osd_data response_data; + } list_watchers; + struct { + u64 expected_object_size; + u64 expected_write_size; + u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ + } alloc_hint; + struct { + u64 snapid; + u64 src_version; + u8 flags; + u32 src_fadvise_flags; + struct ceph_osd_data osd_data; + } copy_from; + struct { + u64 ver; + } assert_ver; }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; /* last raw pg we mapped to */ + struct ceph_spg spgid; /* last actual spg we mapped to */ + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + bool recovery_deletes; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool used_replica; + bool paused; + + u32 epoch; + u32 last_force_resend; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; - struct list_head r_linger_item; - struct list_head r_linger_osd; + struct rb_node r_mc_node; /* map check */ + struct work_struct r_complete_work; struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; + + struct ceph_osd_request_target r_t; +#define r_base_oid r_t.base_oid +#define r_base_oloc r_t.base_oloc +#define r_flags r_t.flags struct ceph_msg *r_request, *r_reply; - int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ /* request osd ops array */ unsigned int r_num_ops; - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; - - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - struct ceph_eversion *r_request_reassert_version; int r_result; - int r_reply_op_len[CEPH_OSD_MAX_OP]; - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; - int r_got_reply; - int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion, r_safe_completion; + bool r_linger; /* don't resend on failure */ + struct completion r_completion; /* private to osd_client.c */ ceph_osdc_callback_t r_callback; - ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; - struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ + struct list_head r_private_item; /* ditto */ void *r_priv; /* ditto */ - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ - int r_oid_len; - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec64 r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ + + /* internal */ + u64 r_version; /* data version sent in reply */ + unsigned long r_stamp; /* jiffies, send or check time */ + unsigned long r_start_stamp; /* jiffies */ + ktime_t r_start_latency; /* ktime_t */ + ktime_t r_end_latency; /* ktime_t */ + int r_attempts; + u32 r_map_dne_bound; + + struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops); +}; - struct ceph_file_layout r_file_layout; - struct ceph_snap_context *r_snapc; /* snap context for writes */ +struct ceph_request_redirect { + struct ceph_object_locator oloc; }; -struct ceph_osd_event { - u64 cookie; - int one_shot; +/* + * osd request identifier + * + * caller name + incarnation# + tid to unique identify this request + */ +struct ceph_osd_reqid { + struct ceph_entity_name name; + __le64 tid; + __le32 inc; +} __packed; + +struct ceph_blkin_trace_info { + __le64 trace_id; + __le64 span_id; + __le64 parent_span_id; +} __packed; + +typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len); +typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); + +struct ceph_osd_linger_request { struct ceph_osd_client *osdc; - void (*cb)(u64, u64, u8, void *); + u64 linger_id; + bool committed; + bool is_watch; /* watch or notify */ + + struct ceph_osd *osd; + struct ceph_osd_request *reg_req; + struct ceph_osd_request *ping_req; + unsigned long ping_sent; + unsigned long watch_valid_thru; + struct list_head pending_lworks; + + struct ceph_osd_request_target t; + u32 map_dne_bound; + + struct timespec64 mtime; + + struct kref kref; + struct mutex lock; + struct rb_node node; /* osd */ + struct rb_node osdc_node; /* osdc */ + struct rb_node mc_node; /* map check */ + struct list_head scan_item; + + struct completion reg_commit_wait; + struct completion notify_finish_wait; + int reg_commit_error; + int notify_finish_error; + int last_error; + + u32 register_gen; + u64 notify_id; + + rados_watchcb2_t wcb; + rados_watcherrcb_t errcb; void *data; + + struct ceph_pagelist *request_pl; + struct page **notify_id_pages; + + struct page ***preply_pages; + size_t *preply_len; +}; + +struct ceph_watch_item { + struct ceph_entity_name name; + u64 cookie; + struct ceph_entity_addr addr; +}; + +struct ceph_spg_mapping { struct rb_node node; - struct list_head osd_node; - struct kref kref; + struct ceph_spg spgid; + + struct rb_root backoffs; }; -struct ceph_osd_event_work { - struct work_struct work; - struct ceph_osd_event *event; - u64 ver; - u64 notify_id; - u8 opcode; +struct ceph_hobject_id { + void *key; + size_t key_len; + void *oid; + size_t oid_len; + u64 snapid; + u32 hash; + u8 is_max; + void *nspace; + size_t nspace_len; + s64 pool; + + /* cache */ + u32 hash_reverse_bits; }; +static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid) +{ + hoid->hash_reverse_bits = bitrev32(hoid->hash); +} + +/* + * PG-wide backoff: [begin, end) + * per-object backoff: begin == end + */ +struct ceph_osd_backoff { + struct rb_node spg_node; + struct rb_node id_node; + + struct ceph_spg spgid; + u64 id; + struct ceph_hobject_id *begin; + struct ceph_hobject_id *end; +}; + +#define CEPH_LINGER_ID_START 0xffff000000000000ULL + struct ceph_osd_client { struct ceph_client *client; struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; + struct rw_semaphore lock; - struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* in-flight lru */ - struct list_head req_unsent; /* unsent/need-resend queue */ - struct list_head req_notarget; /* map to no osd */ - struct list_head req_linger; /* lingering requests */ - int num_requests; + spinlock_t osd_lru_lock; + u32 epoch_barrier; + struct ceph_osd homeless_osd; + atomic64_t last_tid; /* tid of last request */ + u64 last_linger_id; + struct rb_root linger_requests; /* lingering requests */ + struct rb_root map_checks; + struct rb_root linger_map_checks; + atomic_t num_requests; + atomic_t num_homeless; + int abort_err; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS @@ -216,27 +432,39 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; - spinlock_t event_lock; - struct rb_root event_tree; - u64 event_count; - struct workqueue_struct *notify_wq; + struct workqueue_struct *completion_wq; }; +static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) +{ + return osdc->osdmap->flags & flag; +} + extern int ceph_osdc_setup(void); extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); -extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, - struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); +void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); -extern void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode); +#define osd_req_op_data(oreq, whch, typ, fld) \ +({ \ + struct ceph_osd_request *__oreq = (oreq); \ + unsigned int __whch = (whch); \ + BUG_ON(__whch >= __oreq->r_num_ops); \ + &__oreq->r_ops[__whch].typ.fld; \ +}) + +struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u32 flags); extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, unsigned int which, @@ -250,113 +478,166 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); +extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc); extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); -extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, - unsigned int which, - struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, - unsigned int which, - struct bio *bio, size_t bio_length); +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bio_iter *bio_pos, + u32 bio_length); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes); +void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bvec_iter *bvec_pos); +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, + unsigned int which, struct iov_iter *iter); -extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, - unsigned int which, - struct ceph_pagelist *pagelist); extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); +void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); - -extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - const char *class, const char *method); -extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag); +int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + const char *class, const char *method); +extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *name, const void *value, + size_t size, u8 cmp_op, u8 cmp_mode); +extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size, + u32 flags); +extern int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); - -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec *mtime); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, u64 offset, u64 *len, - int num_ops, int opcode, int flags, + unsigned int which, int num_ops, + int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool); -extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); +int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt); -static inline void ceph_osdc_get_request(struct ceph_osd_request *req) -{ - kref_get(&req->r_kref); -} -extern void ceph_osdc_release_request(struct kref *kref); -static inline void ceph_osdc_put_request(struct ceph_osd_request *req) +/* + * How big an extent array should we preallocate for a sparse read? This is + * just a starting value. If we get more than this back from the OSD, the + * receiver will reallocate. + */ +#define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 + +static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { - kref_put(&req->r_kref, ceph_osdc_release_request); + if (!cnt) + cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL; + + return __ceph_alloc_sparse_ext_map(op, cnt); } -extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail); +extern void ceph_osdc_get_request(struct ceph_osd_request *req); +extern void ceph_osdc_put_request(struct ceph_osd_request *req); + +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); extern void ceph_osdc_sync(struct ceph_osd_client *osdc); -extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int nr_pages, - int page_align); - -extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *sc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - struct page **pages, int nr_pages); - -/* watch/notify events */ -extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent); -extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern void ceph_osdc_put_event(struct ceph_osd_event *event); -#endif +extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); + +int ceph_osdc_call(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + const char *class, const char *method, + unsigned int flags, + struct page *req_page, size_t req_len, + struct page **resp_pages, size_t *resp_len); + +/* watch/notify */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data); +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + u32 payload_len); +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + u32 payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len); +int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_watch_item **watchers, + u32 *num_watchers); + +/* Find offset into the buffer of the end of the extent map */ +static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op) +{ + struct ceph_sparse_extent *ext; + /* No extents? No data */ + if (op->extent.sparse_ext_cnt == 0) + return 0; + + ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1]; + + return ext->off + ext->len - op->extent.offset; +} + +#endif diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index d05cc4451af6..5553019c3f07 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -1,10 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_OSDMAP_H #define _FS_CEPH_OSDMAP_H #include <linux/rbtree.h> #include <linux/ceph/types.h> #include <linux/ceph/decode.h> -#include <linux/ceph/ceph_fs.h> #include <linux/crush/crush.h> /* @@ -24,93 +24,199 @@ struct ceph_pg { uint32_t seed; }; -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +#define CEPH_SPG_NOSHARD -1 + +struct ceph_spg { + struct ceph_pg pgid; + s8 shard; +}; + +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); +int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs); + +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ +#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota, + will set FULL too */ +#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; - u64 flags; + s64 read_tier; + s64 write_tier; /* wins for read+write ops */ + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; + + bool was_full; /* for handle_one_map() */ }; +static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) +{ + switch (pool->type) { + case CEPH_POOL_TYPE_REP: + return true; + case CEPH_POOL_TYPE_EC: + return false; + default: + BUG(); + } +} + struct ceph_object_locator { - uint64_t pool; - char *key; + s64 pool; + struct ceph_string *pool_ns; +}; + +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; + oloc->pool_ns = NULL; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src); +void ceph_oloc_destroy(struct ceph_object_locator *oloc); + +/* + * 51-char inline_name is long enough for all cephfs and all but one + * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be + * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all + * other rbd requests fit into inline_name. + * + * Makes ceph_object_id 64 bytes on 64-bit. + */ +#define CEPH_OID_INLINE_LEN 52 + +/* + * Both inline and external buffers have space for a NUL-terminator, + * which is carried around. It's not required though - RADOS object + * names don't have to be NUL-terminated and may contain NULs. + */ +struct ceph_object_id { + char *name; + char inline_name[CEPH_OID_INLINE_LEN]; + int name_len; +}; + +#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name } + +#define CEPH_DEFINE_OID_ONSTACK(oid) \ + struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid) + +static inline void ceph_oid_init(struct ceph_object_id *oid) +{ + *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid); +} + +static inline bool ceph_oid_empty(const struct ceph_object_id *oid) +{ + return oid->name == oid->inline_name && !oid->name_len; +} + +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src); +__printf(2, 3) +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); +__printf(3, 4) +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...); +void ceph_oid_destroy(struct ceph_object_id *oid); + +struct workspace_manager { + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; }; struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; - int len; - int osds[]; + + union { + struct { + int len; + int osds[]; + } pg_temp, pg_upmap; + struct { + int osd; + } primary_temp; + struct { + int len; + int from_to[][2]; + } pg_upmap_items; + }; }; struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ u32 max_osd; /* size of osd_state, _offload, _addr arrays */ - u8 *osd_state; /* CEPH_OSD_* */ + u32 *osd_state; /* CEPH_OSD_* */ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ struct ceph_entity_addr *osd_addr; struct rb_root pg_temp; + struct rb_root primary_temp; + + /* remap (post-CRUSH, pre-up) */ + struct rb_root pg_upmap; /* PG := raw set */ + struct rb_root pg_upmap_items; /* from -> to within raw set */ + + u32 *osd_primary_affinity; + struct rb_root pg_pools; u32 pool_max; /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ struct crush_map *crush; -}; -/* - * file layout helpers - */ -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) - -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} + struct workspace_manager crush_wsm; +}; -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) +static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) { - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); + return osd >= 0 && osd < map->max_osd && + (map->osd_state[osd] & CEPH_OSD_EXISTS); } - -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) { - return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); + return ceph_osd_exists(map, osd) && + (map->osd_state[osd] & CEPH_OSD_UP); } -static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) +static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) { - return map && (map->flags & flag); + return !ceph_osd_is_up(map, osd); } -extern char *ceph_osdmap_state_str(char *str, int len, int state); +char *ceph_osdmap_state_str(char *str, int len, u32 state); +extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, int osd) @@ -120,18 +226,19 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, return &map->osd_addr[osd]; } +#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4) + static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) { __u8 version; - if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { - pr_warning("incomplete pg encoding"); - + if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) { + pr_warn("incomplete pg encoding\n"); return -EINVAL; } version = ceph_decode_8(p); if (version > 1) { - pr_warning("do not understand pg encoding %d > 1", + pr_warn("do not understand pg encoding %d > 1\n", (int)version); return -EINVAL; } @@ -143,27 +250,90 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } -extern struct ceph_osdmap *osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *ceph_osdmap_alloc(void); +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); -/* calculate mapping of a file extent to an object */ -extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 len, - u64 *bno, u64 *oxoff, u64 *oxlen); +struct ceph_osds { + int osds[CEPH_PG_MAX_SIZE]; + int size; + int primary; /* id, NOT index */ +}; + +static inline void ceph_osds_init(struct ceph_osds *set) +{ + set->size = 0; + set->primary = -1; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); + +bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, + u32 new_pg_num); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + +void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); + +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting); +bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_spg *spgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); + +struct crush_loc { + char *cl_type_name; + char *cl_name; +}; + +struct crush_loc_node { + struct rb_node cl_node; + struct crush_loc cl_loc; /* pointers into cl_data */ + char cl_data[]; +}; + +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); +void ceph_clear_crush_locs(struct rb_root *locs); -/* calculate mapping of object to a placement group */ -extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, - struct ceph_pg pgid, - int *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs); +extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, + u64 id); extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); +u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); #endif diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 9660d6b0a35d..879bec0863aa 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -1,7 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_CEPH_PAGELIST_H #define __FS_CEPH_PAGELIST_H +#include <asm/byteorder.h> +#include <linux/refcount.h> #include <linux/list.h> +#include <linux/types.h> struct ceph_pagelist { struct list_head head; @@ -10,25 +14,12 @@ struct ceph_pagelist { size_t room; struct list_head free_list; size_t num_pages_free; + refcount_t refcnt; }; -struct ceph_pagelist_cursor { - struct ceph_pagelist *pl; /* pagelist, for error checking */ - struct list_head *page_lru; /* page in list */ - size_t room; /* room remaining to reset to */ -}; - -static inline void ceph_pagelist_init(struct ceph_pagelist *pl) -{ - INIT_LIST_HEAD(&pl->head); - pl->mapped_tail = NULL; - pl->length = 0; - pl->room = 0; - INIT_LIST_HEAD(&pl->free_list); - pl->num_pages_free = 0; -} +struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags); -extern int ceph_pagelist_release(struct ceph_pagelist *pl); +extern void ceph_pagelist_release(struct ceph_pagelist *pl); extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); @@ -36,12 +27,6 @@ extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); -extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - -extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { __le64 ev = cpu_to_le64(v); @@ -62,7 +47,7 @@ static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) return ceph_pagelist_append(pl, &v, 1); } static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, - char *s, size_t len) + char *s, u32 len) { int ret = ceph_pagelist_encode_32(pl, len); if (ret) diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 68c96a508ac2..73c3efbec36c 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef CEPH_RADOS_H #define CEPH_RADOS_H @@ -50,7 +51,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 -#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ +#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */ /* * placement group. @@ -81,8 +82,9 @@ struct ceph_pg_v1 { */ #define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 +#define CEPH_POOL_TYPE_REP 1 +#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */ +#define CEPH_POOL_TYPE_EC 3 /* * stable_mod func is used to control number of placement groups. @@ -113,8 +115,8 @@ struct ceph_object_layout { * compound epoch+version, used by storage layer to serialize mutations */ struct ceph_eversion { - __le32 epoch; __le64 version; + __le32 epoch; } __attribute__ ((packed)); /* @@ -133,12 +135,18 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSD_IN 0x10000 #define CEPH_OSD_OUT 0 +/* osd primary-affinity. fixed point value: 0x10000 == baseline */ +#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 +#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 + /* * osd map flag bits */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), + not set since ~luminous */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), + not set since ~luminous */ #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ @@ -148,6 +156,15 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ /* * The error code to return when an OSD can't handle a write @@ -167,6 +184,7 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSD_OP_MODE_WR 0x2000 #define CEPH_OSD_OP_MODE_RMW 0x3000 #define CEPH_OSD_OP_MODE_SUB 0x4000 +#define CEPH_OSD_OP_MODE_CACHE 0x8000 #define CEPH_OSD_OP_TYPE 0x0f00 #define CEPH_OSD_OP_TYPE_LOCK 0x0100 @@ -176,100 +194,135 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSD_OP_TYPE_PG 0x0500 #define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */ +#define __CEPH_OSD_OP1(mode, nr) \ + (CEPH_OSD_OP_MODE_##mode | (nr)) + +#define __CEPH_OSD_OP(mode, type, nr) \ + (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr)) + +#define __CEPH_FORALL_OSD_OPS(f) \ + /** data **/ \ + /* read */ \ + f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \ + f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \ + f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \ + \ + /* fancy read */ \ + f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \ + f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \ + \ + f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \ + f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \ + \ + /* versioning */ \ + f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \ + \ + f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \ + \ + f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \ + \ + /* sync */ \ + f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ + \ + /* write */ \ + f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \ + f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \ + f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \ + f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \ + f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \ + \ + /* fancy write */ \ + f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ + f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ + f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \ + \ + f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \ + f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \ + f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \ + \ + f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \ + f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \ + \ + f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \ + \ + /* omap */ \ + f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \ + f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \ + f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \ + f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \ + f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \ + f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \ + f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \ + f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \ + f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \ + \ + /* tiering */ \ + f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ + f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \ + f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ + f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ + f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \ + f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \ + f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \ + f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \ + \ + /* convert tmap to omap */ \ + f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \ + \ + /* hints */ \ + f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \ + \ + /** multi **/ \ + f(CLONERANGE, __CEPH_OSD_OP(WR, MULTI, 1), "clonerange") \ + f(ASSERT_SRC_VERSION, __CEPH_OSD_OP(RD, MULTI, 2), "assert-src-version") \ + f(SRC_CMPXATTR, __CEPH_OSD_OP(RD, MULTI, 3), "src-cmpxattr") \ + \ + /** attrs **/ \ + /* read */ \ + f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \ + f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \ + f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \ + \ + /* write */ \ + f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \ + f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \ + f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \ + f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \ + \ + /** subop **/ \ + f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \ + f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \ + f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \ + f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \ + f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \ + f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \ + f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \ + f(SCRUB_STOP, __CEPH_OSD_OP1(SUB, 8), "scrub-stop") \ + f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \ + \ + /** lock **/ \ + f(WRLOCK, __CEPH_OSD_OP(WR, LOCK, 1), "wrlock") \ + f(WRUNLOCK, __CEPH_OSD_OP(WR, LOCK, 2), "wrunlock") \ + f(RDLOCK, __CEPH_OSD_OP(WR, LOCK, 3), "rdlock") \ + f(RDUNLOCK, __CEPH_OSD_OP(WR, LOCK, 4), "rdunlock") \ + f(UPLOCK, __CEPH_OSD_OP(WR, LOCK, 5), "uplock") \ + f(DNLOCK, __CEPH_OSD_OP(WR, LOCK, 6), "dnlock") \ + \ + /** exec **/ \ + /* note: the RD bit here is wrong; see special-case below in helper */ \ + f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \ + \ + /** pg **/ \ + f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \ + f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \ + f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \ + f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") + enum { - /** data **/ - /* read */ - CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, - CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, - - /* fancy read */ - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, - CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5, - - CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6, - CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7, - - /* versioning */ - CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8, - - /* write */ - CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, - CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, - CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, - CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, - - /* fancy write */ - CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, - CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, - CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, - CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, - - CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, - CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, - CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, - - CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, - CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, - - CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, - - /* omap */ - CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17, - CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18, - CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19, - CEPH_OSD_OP_OMAPGETVALSBYKEYS = - CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20, - CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21, - CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22, - CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23, - CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, - CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, - - /** multi **/ - CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, - CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, - CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3, - - /** attrs **/ - /* read */ - CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, - CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, - - /* write */ - CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, - CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, - CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, - - /** subop **/ - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, - CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, - CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, - CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, - CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, - CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9, - - /** lock **/ - CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, - CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, - CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, - CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, - CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, - CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, - - /** exec **/ - /* note: the RD bit here is wrong; see special-case below in helper */ - CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, - - /** pg **/ - CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, - CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2, +#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode), +__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY) +#undef GENERATE_ENUM_ENTRY }; static inline int ceph_osd_op_type_lock(int op) @@ -344,15 +397,34 @@ enum { CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in + the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed + in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only + once by this client */ }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ +#define EBLOCKLISTED ESHUTDOWN /* blocklisted */ /* xattr comparison */ enum { @@ -370,7 +442,46 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; -#define RADOS_NOTIFY_VER 1 +enum { + CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to + * cloneid */ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */ +}; + +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +const char *ceph_osd_watch_op_name(int o); + +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + +enum { + CEPH_OSD_BACKOFF_OP_BLOCK = 1, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, +}; /* * an individual object operation. each may be accompanied by some data @@ -378,7 +489,7 @@ enum { */ struct ceph_osd_op { __le16 op; /* CEPH_OSD_OP_* */ - __le32 flags; /* CEPH_OSD_FLAG_* */ + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ union { struct { __le64 offset, length; @@ -405,13 +516,37 @@ struct ceph_osd_op { } __attribute__ ((packed)) snap; struct { __le64 cookie; - __le64 ver; - __u8 flag; /* 0 = unwatch, 1 = watch */ + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; struct { + __le64 cookie; + } __attribute__ ((packed)) notify; + struct { + __le64 unused; + __le64 ver; + } __attribute__ ((packed)) assert_ver; + struct { __le64 offset, length; __le64 src_offset; } __attribute__ ((packed)) clonerange; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ + } __attribute__ ((packed)) alloc_hint; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ + /* + * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags + * for src object, flags for dest object are in + * ceph_osd_op::flags. + */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; }; __le32 payload_len; } __attribute__ ((packed)); diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h new file mode 100644 index 000000000000..a4a9962d1e14 --- /dev/null +++ b/include/linux/ceph/string_table.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_STRING_TABLE_H +#define _FS_CEPH_STRING_TABLE_H + +#include <linux/types.h> +#include <linux/kref.h> +#include <linux/rbtree.h> +#include <linux/rcupdate.h> + +struct ceph_string { + struct kref kref; + union { + struct rb_node node; + struct rcu_head rcu; + }; + size_t len; + char str[]; +}; + +extern void ceph_release_string(struct kref *ref); +extern struct ceph_string *ceph_find_or_create_string(const char *str, + size_t len); +extern bool ceph_strings_empty(void); + +static inline struct ceph_string *ceph_get_string(struct ceph_string *str) +{ + kref_get(&str->kref); + return str; +} + +static inline void ceph_put_string(struct ceph_string *str) +{ + if (!str) + return; + kref_put(&str->kref, ceph_release_string); +} + +static inline int ceph_compare_string(struct ceph_string *cs, + const char* str, size_t len) +{ + size_t cs_len = cs ? cs->len : 0; + if (cs_len != len) + return cs_len - len; + if (len == 0) + return 0; + return strncmp(cs->str, str, len); +} + +#define ceph_try_get_string(x) \ +({ \ + struct ceph_string *___str; \ + rcu_read_lock(); \ + for (;;) { \ + ___str = rcu_dereference(x); \ + if (!___str || \ + kref_get_unless_zero(&___str->kref)) \ + break; \ + } \ + rcu_read_unlock(); \ + (___str); \ +}) + +#endif diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h new file mode 100644 index 000000000000..3486636c0e6e --- /dev/null +++ b/include/linux/ceph/striper.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CEPH_STRIPER_H +#define _LINUX_CEPH_STRIPER_H + +#include <linux/list.h> +#include <linux/types.h> + +struct ceph_file_layout; + +void ceph_calc_file_object_mapping(struct ceph_file_layout *l, + u64 off, u64 len, + u64 *objno, u64 *objoff, u32 *xlen); + +struct ceph_object_extent { + struct list_head oe_item; + u64 oe_objno; + u64 oe_off; + u64 oe_len; +}; + +static inline void ceph_object_extent_init(struct ceph_object_extent *ex) +{ + INIT_LIST_HEAD(&ex->oe_item); +} + +/* + * Called for each mapped stripe unit. + * + * @bytes: number of bytes mapped, i.e. the minimum of the full length + * requested (file extent length) or the remainder of the stripe + * unit within an object + */ +typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex, + u32 bytes, void *arg); + +int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + struct ceph_object_extent *alloc_fn(void *arg), + void *alloc_arg, + ceph_object_extent_fn_t action_fn, + void *action_arg); +int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + ceph_object_extent_fn_t action_fn, + void *action_arg); + +struct ceph_file_extent { + u64 fe_off; + u64 fe_len; +}; + +static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents, + u32 num_file_extents) +{ + u64 bytes = 0; + u32 i; + + for (i = 0; i < num_file_extents; i++) + bytes += file_extents[i].fe_len; + + return bytes; +} + +int ceph_extent_to_file(struct ceph_file_layout *l, + u64 objno, u64 objoff, u64 objlen, + struct ceph_file_extent **file_extents, + u32 *num_file_extents); + +u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size); + +#endif diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h index d3ff1cf2d27e..bd3d532902d7 100644 --- a/include/linux/ceph/types.h +++ b/include/linux/ceph/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_TYPES_H #define _FS_CEPH_TYPES_H @@ -23,6 +24,7 @@ struct ceph_vino { /* context for the caps reservation mechanism */ struct ceph_cap_reservation { int count; + int used; }; |
